gitcloner.py
2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Copyright 2020-present Tae Hwan Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from queue import Queue
from threading import Lock, Thread
from time import sleep

import git
from git import Repo
class ClonePooler(object):
    """Pool of daemon threads that clone GitHub repositories from a queue.

    The constructor starts the worker threads immediately. Each worker loops
    forever: it takes one repository name from the internal queue, clones it
    into ``repos_dir`` and marks the queue item as done.

    Args:
        total_repos: Number of repositories expected; used only as the
            denominator of the printed progress line.
        num_worker_threads: Number of worker threads to start. When ``None``
            the value is read from the module-level ``args`` namespace (the
            only behavior the class had before this parameter existed).
        repos_dir: Directory under which repositories are cloned. When
            ``None`` it is read from the module-level ``args`` namespace.
    """

    def __init__(self, total_repos, num_worker_threads=None, repos_dir=None):
        self.count = 0
        self.total_repos = total_repos
        self._queue = Queue()
        # Guards ``self.count``: several workers finish clones concurrently.
        self._count_lock = Lock()

        # Fall back to the script-level namespace so existing callers that
        # pass only ``total_repos`` keep working.
        if num_worker_threads is None:
            num_worker_threads = args.num_worker_threads
        if repos_dir is None:
            repos_dir = args.repos_dir
        self.num_worker_threads = num_worker_threads
        self.repos_dir = repos_dir

        # Start workers last, once every attribute they read is in place.
        for _ in range(self.num_worker_threads):
            worker = Thread(target=self._worker)
            worker.daemon = True
            worker.start()

    def _worker(self):
        """Process queued repositories forever (runs in a daemon thread)."""
        while True:
            repo = self._queue.get()
            try:
                self.do_job(repo)
            except Exception as exc:
                # Thread boundary: report and keep the worker alive so the
                # remaining queue items are still processed.
                print(f'{repo} failed: {exc!r}')
            finally:
                # Always acknowledge the item; otherwise join_queue() would
                # block forever after an unexpected failure.
                self._queue.task_done()

    def set_queue(self, repos):
        """Enqueue one repository name for cloning."""
        self._queue.put(repos)

    def join_queue(self):
        """Block until every enqueued repository has been processed."""
        self._queue.join()

    def do_job(self, repo):
        """Clone ``repo`` (an ``owner/name`` string) and print progress.

        Git errors are printed rather than raised, so one failing repository
        does not stop the rest of the run.
        """
        try:
            # NOTE(review): the empty "user:password" part presumably keeps
            # git from prompting for credentials -- confirm.
            Repo.clone_from(
                f'https://:@github.com/{repo}.git',
                f'{self.repos_dir}/{repo}'
            )
        except git.exc.InvalidGitRepositoryError:
            print(f'{repo} is not found.')
        except git.exc.GitError as e:
            print(e)
        else:
            sleep(0.1)
            print(self._record_success())

    def _record_success(self):
        """Count one finished clone and return the progress line for it."""
        with self._count_lock:
            self.count += 1
            done = self.count
        return f"{done}/{self.total_repos} {format((done / self.total_repos) * 100, '.2f')}"
def main(args):
    """Clone every repository listed in the file ``args.repositories``.

    The file holds one repository per line, either as ``owner/name`` or as a
    full ``https://github.com/owner/name`` URL. Duplicates are cloned once.

    Args:
        args: Namespace providing ``repositories`` (path of the list file)
            and ``repos_dir`` (destination directory, created if missing).
    """
    os.makedirs(args.repos_dir, exist_ok=True)

    repos = set()
    with open(args.repositories, encoding="utf-8") as f:
        for line in f:
            name = line.strip().replace('https://github.com/', '')
            # Skip blank lines: an empty name would be queued as a bogus
            # repository and inflate the progress total.
            if name:
                repos.add(name)

    pooler = ClonePooler(
        total_repos=len(repos)
    )
    for repo in repos:
        pooler.set_queue(repo)
    # Wait until the workers have processed every queued repository.
    pooler.join_queue()
if __name__ == '__main__':
    # Command-line entry point: parse the options, then run the cloner.
    parser = argparse.ArgumentParser(description="")
    parser.add_argument(
        "--repositories",
        type=str,
        required=True,
        help="repositories file path.",
    )
    parser.add_argument(
        "--repos_dir",
        type=str,
        required=True,
        help="directory that all repositories will be downloaded.",
    )
    parser.add_argument(
        "--num_worker_threads",
        type=int,
        default=16,
        help="number of threads in a worker",
    )

    # ``args`` is deliberately left at module level: ClonePooler reads it as
    # a global.
    args = parser.parse_args()
    main(args)