(fix) git clone the top 100 Python repositories and change the parser rule
Showing 3 changed files with 165 additions and 73 deletions
... | @@ -14,10 +14,8 @@ | ... | @@ -14,10 +14,8 @@ |
14 | 14 | ||
15 | import os | 15 | import os |
16 | import git | 16 | import git |
17 | -import json | ||
18 | import argparse | 17 | import argparse |
19 | from git import Repo | 18 | from git import Repo |
20 | -from tqdm import tqdm | ||
21 | from time import sleep | 19 | from time import sleep |
22 | from queue import Queue | 20 | from queue import Queue |
23 | from threading import Thread | 21 | from threading import Thread |
... | @@ -55,7 +53,7 @@ class ClonePooler(object): | ... | @@ -55,7 +53,7 @@ class ClonePooler(object): |
55 | ) | 53 | ) |
56 | sleep(0.1) | 54 | sleep(0.1) |
57 | self.count += 1 | 55 | self.count += 1 |
58 | - print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}") | 56 | + print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}") |
59 | except git.exc.InvalidGitRepositoryError: | 57 | except git.exc.InvalidGitRepositoryError: |
60 | print(f'{repo} is not found.') | 58 | print(f'{repo} is not found.') |
61 | except git.exc.GitError as e: | 59 | except git.exc.GitError as e: |
... | @@ -65,11 +63,10 @@ def main(args): | ... | @@ -65,11 +63,10 @@ def main(args): |
65 | 63 | ||
66 | os.makedirs(args.repos_dir, exist_ok=True) | 64 | os.makedirs(args.repos_dir, exist_ok=True) |
67 | repos = set() | 65 | repos = set() |
68 | - with open(args.jsonl_file, encoding="utf-8") as f: | 66 | + with open(args.repositories, encoding="utf-8") as f: |
69 | for idx, line in enumerate(f): | 67 | for idx, line in enumerate(f): |
70 | line = line.strip() | 68 | line = line.strip() |
71 | - js = json.loads(line) | 69 | + repos.add(line.replace('https://github.com/', '')) |
72 | - repos.add(js['repo']) | ||
73 | 70 | ||
74 | pooler = ClonePooler( | 71 | pooler = ClonePooler( |
75 | total_repos=len(repos) | 72 | total_repos=len(repos) |
... | @@ -80,8 +77,8 @@ def main(args): | ... | @@ -80,8 +77,8 @@ def main(args): |
80 | 77 | ||
81 | if __name__ == '__main__': | 78 | if __name__ == '__main__': |
82 | parser = argparse.ArgumentParser(description="") | 79 | parser = argparse.ArgumentParser(description="") |
83 | - parser.add_argument("--jsonl_file", type=str, required=True, | 80 | + parser.add_argument("--repositories", type=str, required=True, |
84 | - help="jsonl file path.") | 81 | + help="repositories file path.") |
85 | parser.add_argument("--repos_dir", type=str, required=True, | 82 | parser.add_argument("--repos_dir", type=str, required=True, |
86 | help="directory that all repositories will be downloaded.") | 83 | help="directory that all repositories will be downloaded.") |
87 | parser.add_argument("--num_worker_threads", type=int, default=16, | 84 | parser.add_argument("--num_worker_threads", type=int, default=16, | ... | ... |
... | @@ -13,6 +13,7 @@ | ... | @@ -13,6 +13,7 @@ |
13 | # limitations under the License. | 13 | # limitations under the License. |
14 | 14 | ||
15 | import os | 15 | import os |
16 | +import re | ||
16 | import json | 17 | import json |
17 | import jsonlines | 18 | import jsonlines |
18 | import argparse | 19 | import argparse |
... | @@ -23,91 +24,85 @@ from multiprocessing.pool import Pool | ... | @@ -23,91 +24,85 @@ from multiprocessing.pool import Pool |
23 | from transformers import RobertaTokenizer | 24 | from transformers import RobertaTokenizer |
24 | from pydriller import GitRepository, RepositoryMining | 25 | from pydriller import GitRepository, RepositoryMining |
25 | 26 | ||
27 | +def message_cleaner(message): | ||
28 | + msg = message.split("\n")[0] | ||
29 | + msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) | ||
30 | + return msg | ||
26 | 31 | ||
27 | -def jobs(repo_paths, args): | ||
28 | - repo, paths = repo_paths | ||
29 | - repo_path = os.path.join(args.repos_dir, repo) | ||
30 | 32 | ||
33 | +def jobs(repo, args): | ||
34 | + repo_path = os.path.join(args.repos_dir, repo) | ||
31 | if os.path.exists(repo_path): | 35 | if os.path.exists(repo_path): |
32 | - gr = GitRepository(repo_path) | 36 | + for commit in RepositoryMining( |
33 | - | 37 | + repo_path, only_modifications_with_file_types=['.py'] |
34 | - for path in paths: | 38 | + ).traverse_commits(): |
35 | - commits = gr.get_commits_modified_file(path) | 39 | + cleaned_message = message_cleaner(commit.msg) |
36 | - for commit in RepositoryMining( | 40 | + tokenized_message = args.tokenizer.tokenize(cleaned_message) |
37 | - repo_path, only_commits=commits | 41 | + if len(tokenized_message) > args.max_target_length: |
38 | - ).traverse_commits(): | 42 | + continue |
39 | - message = (commit.msg).split("\n")[0] | 43 | + |
44 | + for mod in commit.modifications: | ||
45 | + if not (mod.old_path and mod.new_path): | ||
46 | + continue | ||
47 | + if os.path.splitext(mod.new_path)[1] != '.py': | ||
48 | + continue | ||
49 | + if not mod.diff_parsed["added"]: | ||
50 | + continue | ||
51 | + if not mod.diff_parsed["deleted"]: | ||
52 | + continue | ||
40 | 53 | ||
41 | added, deleted = [], [] | 54 | added, deleted = [], [] |
42 | - for mod in commit.modifications: | 55 | + |
43 | - if mod.new_path == path: | 56 | + for line, code in mod.diff_parsed["added"]: |
44 | - for line, code in mod.diff_parsed["added"]: | 57 | + added.extend(args.tokenizer.tokenize(code)) |
45 | - added += args.tokenizer.tokenize(code) | 58 | + |
46 | - assert isinstance(added, list) | 59 | + for line, code in mod.diff_parsed["deleted"]: |
47 | - | 60 | + deleted.extend(args.tokenizer.tokenize(code)) |
48 | - for line, code in mod.diff_parsed["deleted"]: | 61 | + |
49 | - deleted += args.tokenizer.tokenize(code) | 62 | + if len(added) + len(deleted) <= args.max_source_length: |
50 | - assert isinstance(deleted, list) | 63 | + with jsonlines.open(args.output_file, mode="a") as writer: |
51 | - | 64 | + writer.write( |
52 | - with jsonlines.open(args.output_file, mode="a") as writer: | 65 | + { |
53 | - writer.write( | 66 | + "msg": tokenized_message, |
54 | - { | 67 | + "added": added, |
55 | - "repo": repo, | 68 | + "deleted": deleted, |
56 | - "path": path, | 69 | + } |
57 | - "sha": commit.hash, | 70 | + ) |
58 | - "msg": args.tokenizer.tokenize(message), | ||
59 | - "added": added, | ||
60 | - "deleted": deleted, | ||
61 | - } | ||
62 | - ) | ||
63 | 71 | ||
64 | def main(args): | 72 | def main(args): |
65 | - repos = defaultdict(list) | 73 | + repos = set() |
66 | - with open(args.jsonl_file, encoding="utf-8") as f: | 74 | + with open(args.repositories, encoding="utf-8") as f: |
67 | for idx, line in enumerate(f): | 75 | for idx, line in enumerate(f): |
68 | line = line.strip() | 76 | line = line.strip() |
69 | - js = json.loads(line) | 77 | + repos.add(line.replace('https://github.com/', '')) |
70 | - repos[js["repo"]].append(js["path"]) | ||
71 | 78 | ||
72 | os.makedirs(args.output_dir, exist_ok=True) | 79 | os.makedirs(args.output_dir, exist_ok=True) |
73 | - args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file)) | 80 | + args.output_file = os.path.join(args.output_dir, 'dataset.jsonl') |
74 | 81 | ||
75 | func = partial(jobs, args=args) | 82 | func = partial(jobs, args=args) |
76 | with Pool(processes=args.num_workers) as pool: | 83 | with Pool(processes=args.num_workers) as pool: |
77 | with tqdm(total=len(repos)) as pbar: | 84 | with tqdm(total=len(repos)) as pbar: |
78 | - for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))): | 85 | + for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))): |
79 | pbar.update() | 86 | pbar.update() |
80 | 87 | ||
81 | 88 | ||
82 | if __name__ == "__main__": | 89 | if __name__ == "__main__": |
83 | parser = argparse.ArgumentParser(description="") | 90 | parser = argparse.ArgumentParser(description="") |
84 | - parser.add_argument( | 91 | + parser.add_argument("--repositories", type=str, required=True, |
85 | - "--jsonl_file", type=str, required=True, help="jsonl file path." | 92 | + help="repositories file path.") |
86 | - ) | 93 | + parser.add_argument("--repos_dir", type=str, required=True, |
87 | - parser.add_argument( | 94 | + help="directory that all repositories had been downloaded.",) |
88 | - "--repos_dir", | 95 | + parser.add_argument("--output_dir", type=str, required=True, |
89 | - type=str, | 96 | + help="The output directory where the preprocessed data will be written.") |
90 | - required=True, | 97 | + parser.add_argument("--tokenizer_name", type=str, |
91 | - help="directory that all repositories will be downloaded.", | 98 | + default="microsoft/codebert-base", help="The name of tokenizer",) |
92 | - ) | 99 | + parser.add_argument("--num_workers", default=4, type=int, help="number of process") |
93 | - parser.add_argument( | 100 | + parser.add_argument("--max_source_length", default=256, type=int, |
94 | - "--output_dir", | 101 | + help="The maximum total source sequence length after tokenization. Sequences longer " |
95 | - type=str, | 102 | + "than this will be truncated, sequences shorter will be padded.") |
96 | - required=True, | 103 | + parser.add_argument("--max_target_length", default=128, type=int, |
97 | - help="The output directory where the preprocessed data will be written.", | 104 | + help="The maximum total target sequence length after tokenization. Sequences longer " |
98 | - ) | 105 | + "than this will be truncated, sequences shorter will be padded.") |
99 | - parser.add_argument( | ||
100 | - "--tokenizer_name", | ||
101 | - type=str, | ||
102 | - default="microsoft/codebert-base", | ||
103 | - help="The name of tokenizer", | ||
104 | - ) | ||
105 | - parser.add_argument( | ||
106 | - "--num_workers", | ||
107 | - default=4, | ||
108 | - type=int, | ||
109 | - help="number of process", | ||
110 | - ) | ||
111 | 106 | ||
112 | args = parser.parse_args() | 107 | args = parser.parse_args() |
113 | 108 | ... | ... |
repositories.txt
0 → 100644
1 | +https://github.com/donnemartin/system-design-primer | ||
2 | +https://github.com/public-apis/public-apis | ||
3 | +https://github.com/TheAlgorithms/Python | ||
4 | +https://github.com/vinta/awesome-python | ||
5 | +https://github.com/tensorflow/models | ||
6 | +https://github.com/nvbn/thefuck | ||
7 | +https://github.com/django/django | ||
8 | +https://github.com/pallets/flask | ||
9 | +https://github.com/httpie/httpie | ||
10 | +https://github.com/josephmisiti/awesome-machine-learning | ||
11 | +https://github.com/ansible/ansible | ||
12 | +https://github.com/psf/requests | ||
13 | +https://github.com/scikit-learn/scikit-learn | ||
14 | +https://github.com/scrapy/scrapy | ||
15 | +https://github.com/minimaxir/big-list-of-naughty-strings | ||
16 | +https://github.com/ageitgey/face_recognition | ||
17 | +https://github.com/home-assistant/core | ||
18 | +https://github.com/soimort/you-get | ||
19 | +https://github.com/huggingface/transformers | ||
20 | +https://github.com/deepfakes/faceswap | ||
21 | +https://github.com/apache/incubator-superset | ||
22 | +https://github.com/XX-net/XX-Net | ||
23 | +https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap | ||
24 | +https://github.com/certbot/certbot | ||
25 | +https://github.com/pandas-dev/pandas | ||
26 | +https://github.com/localstack/localstack | ||
27 | +https://github.com/getsentry/sentry | ||
28 | +https://github.com/3b1b/manim | ||
29 | +https://github.com/faif/python-patterns | ||
30 | +https://github.com/google-research/bert | ||
31 | +https://github.com/facebookresearch/Detectron | ||
32 | +https://github.com/openai/gym | ||
33 | +https://github.com/tiangolo/fastapi | ||
34 | +https://github.com/ycm-core/YouCompleteMe | ||
35 | +https://github.com/0xAX/linux-insides | ||
36 | +https://github.com/satwikkansal/wtfpython | ||
37 | +https://github.com/pypa/pipenv | ||
38 | +https://github.com/CorentinJ/Real-Time-Voice-Cloning | ||
39 | +https://github.com/donnemartin/interactive-coding-challenges | ||
40 | +https://github.com/docker/compose | ||
41 | +https://github.com/iperov/DeepFaceLab | ||
42 | +https://github.com/mitmproxy/mitmproxy | ||
43 | +https://github.com/donnemartin/data-science-ipython-notebooks | ||
44 | +https://github.com/tornadoweb/tornado | ||
45 | +https://github.com/chubin/cheat.sh | ||
46 | +https://github.com/trailofbits/algo | ||
47 | +https://github.com/geekcomputers/Python | ||
48 | +https://github.com/encode/django-rest-framework | ||
49 | +https://github.com/d2l-ai/d2l-zh | ||
50 | +https://github.com/apache/airflow | ||
51 | +https://github.com/matterport/Mask_RCNN | ||
52 | +https://github.com/swisskyrepo/PayloadsAllTheThings | ||
53 | +https://github.com/yunjey/pytorch-tutorial | ||
54 | +https://github.com/sqlmapproject/sqlmap | ||
55 | +https://github.com/psf/black | ||
56 | +https://github.com/eriklindernoren/ML-From-Scratch | ||
57 | +https://github.com/keon/algorithms | ||
58 | +https://github.com/google/python-fire | ||
59 | +https://github.com/explosion/spaCy | ||
60 | +https://github.com/drduh/macOS-Security-and-Privacy-Guide | ||
61 | +https://github.com/nicolargo/glances | ||
62 | +https://github.com/sebastianruder/NLP-progress | ||
63 | +https://github.com/StevenBlack/hosts | ||
64 | +https://github.com/tqdm/tqdm | ||
65 | +https://github.com/celery/celery | ||
66 | +https://github.com/magenta/magenta | ||
67 | +https://github.com/gto76/python-cheatsheet | ||
68 | +https://github.com/reddit-archive/reddit | ||
69 | +https://github.com/numpy/numpy | ||
70 | +https://github.com/sherlock-project/sherlock | ||
71 | +https://github.com/instillai/TensorFlow-Course | ||
72 | +https://github.com/charlax/professional-programming | ||
73 | +https://github.com/binux/pyspider | ||
74 | +https://github.com/ipython/ipython | ||
75 | +https://github.com/deezer/spleeter | ||
76 | +https://github.com/pytorch/examples | ||
77 | +https://github.com/toml-lang/toml | ||
78 | +https://github.com/luong-komorebi/Awesome-Linux-Software | ||
79 | +https://github.com/bokeh/bokeh | ||
80 | +https://github.com/bitcoinbook/bitcoinbook | ||
81 | +https://github.com/locustio/locust | ||
82 | +https://github.com/nginx-proxy/nginx-proxy | ||
83 | +https://github.com/microsoft/cascadia-code | ||
84 | +https://github.com/OWASP/CheatSheetSeries | ||
85 | +https://github.com/spotify/luigi | ||
86 | +https://github.com/cool-RR/PySnooper | ||
87 | +https://github.com/ray-project/ray | ||
88 | +https://github.com/openai/gpt-2 | ||
89 | +https://github.com/willmcgugan/rich | ||
90 | +https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix | ||
91 | +https://github.com/facebookresearch/detectron2 | ||
92 | +https://github.com/plotly/dash | ||
93 | +https://github.com/PaddlePaddle/Paddle | ||
94 | +https://github.com/cookiecutter/cookiecutter | ||
95 | +https://github.com/chubin/wttr.in | ||
96 | +https://github.com/zulip/zulip | ||
97 | +https://github.com/python-poetry/poetry | ||
98 | +https://github.com/fabric/fabric | ||
99 | +https://github.com/matplotlib/matplotlib | ||
100 | +https://github.com/tzutalin/labelImg | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or log in to post a comment