Showing
1 changed file
with
92 additions
and
0 deletions
gitcloner.py
0 → 100644
| 1 | +# Copyright 2020-present Tae Hwan Jung | ||
| 2 | +# | ||
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 4 | +# you may not use this file except in compliance with the License. | ||
| 5 | +# You may obtain a copy of the License at | ||
| 6 | +# | ||
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
| 8 | +# | ||
| 9 | +# Unless required by applicable law or agreed to in writing, software | ||
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 12 | +# See the License for the specific language governing permissions and | ||
| 13 | +# limitations under the License. | ||
| 14 | + | ||
| 15 | +import os | ||
| 16 | +import git | ||
| 17 | +import json | ||
| 18 | +import argparse | ||
| 19 | +from git import Repo | ||
| 20 | +from tqdm import tqdm | ||
| 21 | +from time import sleep | ||
| 22 | +from queue import Queue | ||
| 23 | +from threading import Thread | ||
| 24 | + | ||
class ClonePooler(object):
    """Clone GitHub repositories concurrently with a pool of daemon threads.

    Repository names (``owner/name``) are queued with :meth:`set_queue`; each
    worker thread takes names off the queue and clones them below
    ``repos_dir``.  :meth:`join_queue` blocks until every queued name has
    been processed.
    """

    def __init__(self, total_repos, num_worker_threads=None, repos_dir=None):
        """Create the job queue and start the worker threads.

        Args:
            total_repos: Number of repositories that will be queued; used
                only for the progress output printed by :meth:`do_job`.
            num_worker_threads: Number of worker threads to start.  When
                ``None`` the value is read from the module-level ``args``
                namespace set up by the command-line entry point.
            repos_dir: Directory the repositories are cloned into.  When
                ``None`` it is read from the module-level ``args`` as well.
        """
        # Imported locally because the module itself only imports ``Thread``.
        from threading import Lock

        # Fall back to the script's global namespace only when the caller did
        # not pass explicit values, so the class is usable outside the CLI.
        if num_worker_threads is None:
            num_worker_threads = args.num_worker_threads
        if repos_dir is None:
            repos_dir = args.repos_dir

        self.count = 0
        self.total_repos = total_repos
        self._queue = Queue()
        # ``count`` is updated from every worker thread; the lock keeps the
        # read-modify-write of the counter atomic.
        self._count_lock = Lock()
        self.num_worker_threads = num_worker_threads
        self.repos_dir = repos_dir

        # All state must be initialised before the first worker starts.
        for _ in range(self.num_worker_threads):
            worker = Thread(target=self._worker)
            worker.daemon = True
            worker.start()

    def _worker(self):
        """Take repository names off the queue and clone them, forever."""
        while True:
            repo = self._queue.get()
            try:
                self.do_job(repo)
            except Exception as e:
                # Thread boundary: an unexpected error must neither kill the
                # worker nor leave the queue item unacknowledged.
                print(f'{repo} failed: {e!r}')
            finally:
                # Always acknowledge the item, otherwise join_queue() would
                # block forever after a failed job.
                self._queue.task_done()

    def set_queue(self, repos):
        """Queue one repository name (``owner/name``) for cloning."""
        self._queue.put(repos)

    def join_queue(self):
        """Block until every queued repository has been processed."""
        self._queue.join()

    def do_job(self, repo):
        """Clone a single repository and print the overall progress.

        Args:
            repo: Repository name in ``owner/name`` form; it is used both in
                the GitHub URL and as the sub-directory below ``repos_dir``.

        Git errors are reported on stdout and do not propagate; a failed
        clone is not counted as progress.
        """
        try:
            # NOTE(review): the empty ``:@`` credentials presumably stop git
            # from prompting for a login on private/missing repositories.
            Repo.clone_from(
                f'https://:@github.com/{repo}.git',
                f'{self.repos_dir}/{repo}'
            )
        except git.exc.InvalidGitRepositoryError:
            print(f'{repo} is not found.')
            return
        except git.exc.GitError as e:
            print(e)
            return

        sleep(0.1)
        with self._count_lock:
            self.count += 1
            done = self.count
        print(f"{done}/{self.total_repos} {(done/self.total_repos) * 100}")
| 63 | + | ||
def main(args):
    """Clone every unique repository listed in a JSON Lines file.

    Args:
        args: Parsed command-line namespace.  ``args.jsonl_file`` is the
            path of a JSONL file whose records each carry a ``'repo'`` key,
            and ``args.repos_dir`` is the directory to clone into (created
            if it does not exist).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'repo'`` key.
    """
    os.makedirs(args.repos_dir, exist_ok=True)

    # A set de-duplicates repositories that appear in several records.
    repos = set()
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not valid JSON; skip them instead of
                # aborting the whole run.
                continue
            repos.add(json.loads(line)['repo'])

    pooler = ClonePooler(
        total_repos=len(repos)
    )
    for repo in repos:
        pooler.set_queue(repo)
    # Wait until the workers have processed every queued repository.
    pooler.join_queue()
| 80 | + | ||
if __name__ == '__main__':
    # Command-line options, listed in the order they are registered.
    cli_options = (
        ("--jsonl_file", {
            "type": str,
            "required": True,
            "help": "jsonl file path.",
        }),
        ("--repos_dir", {
            "type": str,
            "required": True,
            "help": "directory that all repositories will be downloaded.",
        }),
        ("--num_worker_threads", {
            "type": int,
            "default": 16,
            "help": "number of threads in a worker",
        }),
    )

    parser = argparse.ArgumentParser(description="")
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)

    # ``args`` must stay a module-level name: ClonePooler reads it as a
    # global when it is constructed.
    args = parser.parse_args()

    main(args)
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment