graykode

(fix) git clone for top 100 Python repositories and change parser rule

@@ -14,10 +14,8 @@
 
 import os
 import git
-import json
 import argparse
 from git import Repo
-from tqdm import tqdm
 from time import sleep
 from queue import Queue
 from threading import Thread
@@ -55,7 +53,7 @@ class ClonePooler(object):
             )
             sleep(0.1)
             self.count += 1
-            print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
+            print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}")
         except git.exc.InvalidGitRepositoryError:
             print(f'{repo} is not found.')
         except git.exc.GitError as e:
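The only behavioural change in this hunk is the progress line: the old f-string printed the percentage at full float precision, while `format(..., '.2f')` pins it to two decimals. A minimal sketch with assumed counter values:

```python
count, total_repos = 1, 3  # assumed values for illustration

# Old: full float precision, e.g. "1/3 33.33333333333333"
print(f"{count}/{total_repos} {(count/total_repos) * 100}")

# New: fixed to two decimals, e.g. "1/3 33.33"
print(f"{count}/{total_repos} {format((count/total_repos) * 100, '.2f')}")
```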
@@ -65,11 +63,10 @@ def main(args):
 
     os.makedirs(args.repos_dir, exist_ok=True)
     repos = set()
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos.add(js['repo'])
+            repos.add(line.replace('https://github.com/', ''))
 
     pooler = ClonePooler(
         total_repos=len(repos)
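This is the parser-rule change from the commit title: the input is now a plain list of GitHub URLs rather than JSONL records, and the `owner/name` slug is recovered by stripping the host prefix. For example:

```python
# A line as it appears in the new repositories file added below.
line = "https://github.com/huggingface/transformers\n"

# New rule: strip whitespace, then drop the host prefix.
repo = line.strip().replace('https://github.com/', '')
assert repo == "huggingface/transformers"
```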
@@ -80,8 +77,8 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
     parser.add_argument("--repos_dir", type=str, required=True,
                         help="directory that all repositories will be downloaded.")
     parser.add_argument("--num_worker_threads", type=int, default=16,
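The renamed `--repositories` flag feeds that URL list straight into the clone pool. The clone call itself sits outside this diff; a hedged, self-contained sketch of what the GitPython side presumably looks like (paths and slug are assumptions):

```python
import os
from git import Repo

repos_dir = "repos"      # assumed value of --repos_dir
repo = "pallets/flask"   # slug parsed from the URL list

# Re-expand the slug into a clone URL; the exact call inside
# ClonePooler's worker is not shown in this diff.
Repo.clone_from(f"https://github.com/{repo}", os.path.join(repos_dir, repo))
```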
...
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import re
 import json
 import jsonlines
 import argparse
@@ -23,91 +24,85 @@ from multiprocessing.pool import Pool
 from transformers import RobertaTokenizer
 from pydriller import GitRepository, RepositoryMining
 
+def message_cleaner(message):
+    msg = message.split("\n")[0]
+    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
+    return msg
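The new `message_cleaner` keeps only the first line of a commit message and strips issue/PR references such as `#123` or `(#123)`. A quick check of the regex on assumed sample messages:

```python
import re

def message_cleaner(message):
    # First line only, then drop references like "#123" or "(#123)".
    msg = message.split("\n")[0]
    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
    return msg

# Assumed sample messages, not taken from any dataset.
print(message_cleaner("Fix crash on empty input (#123)\n\nlong body"))
# -> "Fix crash on empty input "
print(message_cleaner("Merge pull request #45 from user/branch"))
# -> "Merge pull request  from user/branch"
```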
 
-def jobs(repo_paths, args):
-    repo, paths = repo_paths
-    repo_path = os.path.join(args.repos_dir, repo)
 
+def jobs(repo, args):
+    repo_path = os.path.join(args.repos_dir, repo)
     if os.path.exists(repo_path):
-        gr = GitRepository(repo_path)
-
-        for path in paths:
-            commits = gr.get_commits_modified_file(path)
-            for commit in RepositoryMining(
-                repo_path, only_commits=commits
-            ).traverse_commits():
-                message = (commit.msg).split("\n")[0]
+        for commit in RepositoryMining(
+            repo_path, only_modifications_with_file_types=['.py']
+        ).traverse_commits():
+            cleaned_message = message_cleaner(commit.msg)
+            tokenized_message = args.tokenizer.tokenize(cleaned_message)
+            if len(tokenized_message) > args.max_target_length:
+                continue
+
+            for mod in commit.modifications:
+                if not (mod.old_path and mod.new_path):
+                    continue
+                if os.path.splitext(mod.new_path)[1] != '.py':
+                    continue
+                if not mod.diff_parsed["added"]:
+                    continue
+                if not mod.diff_parsed["deleted"]:
+                    continue
 
                 added, deleted = [], []
-                for mod in commit.modifications:
-                    if mod.new_path == path:
-                        for line, code in mod.diff_parsed["added"]:
-                            added += args.tokenizer.tokenize(code)
-                        assert isinstance(added, list)
-
-                        for line, code in mod.diff_parsed["deleted"]:
-                            deleted += args.tokenizer.tokenize(code)
-                        assert isinstance(deleted, list)
-
-                with jsonlines.open(args.output_file, mode="a") as writer:
-                    writer.write(
-                        {
-                            "repo": repo,
-                            "path": path,
-                            "sha": commit.hash,
-                            "msg": args.tokenizer.tokenize(message),
-                            "added": added,
-                            "deleted": deleted,
-                        }
-                    )
+
+                for line, code in mod.diff_parsed["added"]:
+                    added.extend(args.tokenizer.tokenize(code))
+
+                for line, code in mod.diff_parsed["deleted"]:
+                    deleted.extend(args.tokenizer.tokenize(code))
+
+                if len(added) + len(deleted) <= args.max_source_length:
+                    with jsonlines.open(args.output_file, mode="a") as writer:
+                        writer.write(
+                            {
+                                "msg": tokenized_message,
+                                "added": added,
+                                "deleted": deleted,
+                            }
+                        )
 
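The rewritten `jobs` replaces the per-path `get_commits_modified_file` lookups with a single pass per repository, letting pydriller's `only_modifications_with_file_types` filter pre-select commits that touch `.py` files; a commit survives only if its tokenized message and diff fit the length budgets. A minimal standalone sketch of that traversal (the local clone path is an assumption):

```python
from pydriller import RepositoryMining

# Assumed local clone of one repository from the list below.
for commit in RepositoryMining(
    "repos/pallets/flask", only_modifications_with_file_types=['.py']
).traverse_commits():
    for mod in commit.modifications:
        # diff_parsed maps "added"/"deleted" to (line_number, code) pairs.
        n_added = len(mod.diff_parsed["added"])
        n_deleted = len(mod.diff_parsed["deleted"])
        print(commit.msg.split("\n")[0], mod.new_path, n_added, n_deleted)
    break  # sketch: inspect the first matching commit only
```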
 def main(args):
-    repos = defaultdict(list)
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    repos = set()
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos[js["repo"]].append(js["path"])
+            repos.add(line.replace('https://github.com/', ''))
 
     os.makedirs(args.output_dir, exist_ok=True)
-    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+    args.output_file = os.path.join(args.output_dir, 'dataset.jsonl')
 
     func = partial(jobs, args=args)
     with Pool(processes=args.num_workers) as pool:
         with tqdm(total=len(repos)) as pbar:
-            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))):
                 pbar.update()
 
 
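`main` now feeds bare `owner/name` slugs into the pool instead of `(repo, paths)` items, with `partial` pinning the shared `args` so each worker runs `jobs(repo, args=args)`. A reduced, self-contained sketch of the same `Pool`/`imap_unordered`/`tqdm` pattern with a stand-in worker:

```python
from functools import partial
from multiprocessing.pool import Pool
from tqdm import tqdm

def jobs(repo, args):
    # Stand-in for the real worker above; just echoes its input.
    return repo

if __name__ == "__main__":
    repos = {"pallets/flask", "psf/requests"}  # assumed slugs
    func = partial(jobs, args=None)
    with Pool(processes=4) as pool:
        with tqdm(total=len(repos)) as pbar:
            for _ in pool.imap_unordered(func, repos):
                pbar.update()
```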
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--jsonl_file", type=str, required=True, help="jsonl file path."
-    )
-    parser.add_argument(
-        "--repos_dir",
-        type=str,
-        required=True,
-        help="directory that all repositories will be downloaded.",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The output directory where the preprocessed data will be written.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default="microsoft/codebert-base",
-        help="The name of tokenizer",
-    )
-    parser.add_argument(
-        "--num_workers",
-        default=4,
-        type=int,
-        help="number of process",
-    )
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
+    parser.add_argument("--repos_dir", type=str, required=True,
+                        help="directory that all repositories had been downloaded.",)
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="The output directory where the preprocessed data will be written.")
+    parser.add_argument("--tokenizer_name", type=str,
+                        default="microsoft/codebert-base", help="The name of tokenizer",)
+    parser.add_argument("--num_workers", default=4, type=int, help="number of process")
+    parser.add_argument("--max_source_length", default=256, type=int,
+                        help="The maximum total source sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_target_length", default=128, type=int,
+                        help="The maximum total target sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
 
     args = parser.parse_args()
 
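One seam worth noting: `jobs` reads `args.tokenizer`, while the parser only defines `--tokenizer_name`. The bridging line falls outside this diff, but given the `RobertaTokenizer` import it presumably resembles the following (hedged reconstruction, not part of the commit):

```python
# Assumed glue after args = parser.parse_args(); not shown in this diff.
args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
```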
...
+https://github.com/donnemartin/system-design-primer
+https://github.com/public-apis/public-apis
+https://github.com/TheAlgorithms/Python
+https://github.com/vinta/awesome-python
+https://github.com/tensorflow/models
+https://github.com/nvbn/thefuck
+https://github.com/django/django
+https://github.com/pallets/flask
+https://github.com/httpie/httpie
+https://github.com/josephmisiti/awesome-machine-learning
+https://github.com/ansible/ansible
+https://github.com/psf/requests
+https://github.com/scikit-learn/scikit-learn
+https://github.com/scrapy/scrapy
+https://github.com/minimaxir/big-list-of-naughty-strings
+https://github.com/ageitgey/face_recognition
+https://github.com/home-assistant/core
+https://github.com/soimort/you-get
+https://github.com/huggingface/transformers
+https://github.com/deepfakes/faceswap
+https://github.com/apache/incubator-superset
+https://github.com/XX-net/XX-Net
+https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
+https://github.com/certbot/certbot
+https://github.com/pandas-dev/pandas
+https://github.com/localstack/localstack
+https://github.com/getsentry/sentry
+https://github.com/3b1b/manim
+https://github.com/faif/python-patterns
+https://github.com/google-research/bert
+https://github.com/facebookresearch/Detectron
+https://github.com/openai/gym
+https://github.com/tiangolo/fastapi
+https://github.com/ycm-core/YouCompleteMe
+https://github.com/0xAX/linux-insides
+https://github.com/satwikkansal/wtfpython
+https://github.com/pypa/pipenv
+https://github.com/CorentinJ/Real-Time-Voice-Cloning
+https://github.com/donnemartin/interactive-coding-challenges
+https://github.com/docker/compose
+https://github.com/iperov/DeepFaceLab
+https://github.com/mitmproxy/mitmproxy
+https://github.com/donnemartin/data-science-ipython-notebooks
+https://github.com/tornadoweb/tornado
+https://github.com/chubin/cheat.sh
+https://github.com/trailofbits/algo
+https://github.com/geekcomputers/Python
+https://github.com/encode/django-rest-framework
+https://github.com/d2l-ai/d2l-zh
+https://github.com/apache/airflow
+https://github.com/matterport/Mask_RCNN
+https://github.com/swisskyrepo/PayloadsAllTheThings
+https://github.com/yunjey/pytorch-tutorial
+https://github.com/sqlmapproject/sqlmap
+https://github.com/psf/black
+https://github.com/eriklindernoren/ML-From-Scratch
+https://github.com/keon/algorithms
+https://github.com/google/python-fire
+https://github.com/explosion/spaCy
+https://github.com/drduh/macOS-Security-and-Privacy-Guide
+https://github.com/nicolargo/glances
+https://github.com/sebastianruder/NLP-progress
+https://github.com/StevenBlack/hosts
+https://github.com/tqdm/tqdm
+https://github.com/celery/celery
+https://github.com/magenta/magenta
+https://github.com/gto76/python-cheatsheet
+https://github.com/reddit-archive/reddit
+https://github.com/numpy/numpy
+https://github.com/sherlock-project/sherlock
+https://github.com/instillai/TensorFlow-Course
+https://github.com/charlax/professional-programming
+https://github.com/binux/pyspider
+https://github.com/ipython/ipython
+https://github.com/deezer/spleeter
+https://github.com/pytorch/examples
+https://github.com/toml-lang/toml
+https://github.com/luong-komorebi/Awesome-Linux-Software
+https://github.com/bokeh/bokeh
+https://github.com/bitcoinbook/bitcoinbook
+https://github.com/locustio/locust
+https://github.com/nginx-proxy/nginx-proxy
+https://github.com/microsoft/cascadia-code
+https://github.com/OWASP/CheatSheetSeries
+https://github.com/spotify/luigi
+https://github.com/cool-RR/PySnooper
+https://github.com/ray-project/ray
+https://github.com/openai/gpt-2
+https://github.com/willmcgugan/rich
+https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
+https://github.com/facebookresearch/detectron2
+https://github.com/plotly/dash
+https://github.com/PaddlePaddle/Paddle
+https://github.com/cookiecutter/cookiecutter
+https://github.com/chubin/wttr.in
+https://github.com/zulip/zulip
+https://github.com/python-poetry/poetry
+https://github.com/fabric/fabric
+https://github.com/matplotlib/matplotlib
+https://github.com/tzutalin/labelImg
\ No newline at end of file