graykode / commit-autosuggestions
Authored by graykode on 2020-10-30 21:03:31 +0900
Commit 4c89b4b6bbe7f003572d36c780d03d46336aefc5 (1 parent: 28ef1238)
(fix) git clone in top 100 python repositories and change parser rule
Showing 3 changed files (gitcloner.py, gitparser.py, repositories.txt) with 152 additions and 60 deletions.
gitcloner.py
@@ -14,10 +14,8 @@
 import os
 import git
-import json
 import argparse
 from git import Repo
 from tqdm import tqdm
 from time import sleep
 from queue import Queue
 from threading import Thread
@@ -55,7 +53,7 @@ class ClonePooler(object):
             )
             sleep(0.1)
             self.count += 1
-            print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
+            print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}")
         except git.exc.InvalidGitRepositoryError:
             print(f'{repo} is not found.')
         except git.exc.GitError as e:
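Note: the only change in this hunk is the progress line, which now pins the percentage to two decimal places. A minimal standalone sketch of the before/after behaviour, with hypothetical counts:

    count, total_repos = 37, 300
    # before: prints the raw float with full precision
    print(f"{count}/{total_repos} {(count / total_repos) * 100}")
    # after: format(..., '.2f') trims it, printing '37/300 12.33'
    print(f"{count}/{total_repos} {format((count / total_repos) * 100, '.2f')}")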
@@ -65,11 +63,10 @@ def main(args):
     os.makedirs(args.repos_dir, exist_ok=True)
     repos = set()
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos.add(js['repo'])
+            repos.add(line.replace('https://github.com/', ''))
     pooler = ClonePooler(
         total_repos=len(repos)
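This hunk is the core of the clone fix: the repository list now comes from a plain text file of GitHub URLs (repositories.txt, added below) rather than from the jsonl corpus, presumably so the full top-100 list is cloned up front. A sketch of the slug extraction on a hypothetical two-line input:

    # Hypothetical input mirroring repositories.txt: one GitHub URL per line.
    lines = ["https://github.com/pallets/flask\n",
             "https://github.com/psf/requests\n"]

    repos = set()
    for line in lines:
        # strip whitespace, then drop the host prefix to get an 'owner/name' slug
        repos.add(line.strip().replace('https://github.com/', ''))

    print(repos)  # {'pallets/flask', 'psf/requests'}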
@@ -80,8 +77,8 @@ def main(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
     parser.add_argument("--repos_dir", type=str, required=True,
                         help="directory that all repositories will be downloaded.")
     parser.add_argument("--num_worker_threads", type=int, default=16,
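For context, the Queue/Thread imports at the top of gitcloner.py point at a worker-pool design for the cloning itself. ClonePooler's internals are outside this diff, so the following is only a hedged sketch of that pattern with GitPython; the function name, queue wiring, and paths here are assumptions, not the file's real code:

    import os
    from queue import Queue
    from threading import Thread
    from git import Repo  # GitPython

    def clone_worker(q, repos_dir):
        # each worker pulls 'owner/name' slugs off the shared queue
        while True:
            repo = q.get()
            try:
                Repo.clone_from(f"https://github.com/{repo}",
                                os.path.join(repos_dir, repo))
            except Exception as e:  # the real code catches git.exc errors specifically
                print(f"{repo}: {e}")
            finally:
                q.task_done()

    q = Queue()
    for _ in range(16):  # matches the --num_worker_threads default above
        Thread(target=clone_worker, args=(q, "repos"), daemon=True).start()
    q.put("pallets/flask")  # hypothetical slug from repositories.txt
    q.join()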
gitparser.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
+import re
 import json
 import jsonlines
 import argparse
@@ -23,91 +24,85 @@ from multiprocessing.pool import Pool
 from transformers import RobertaTokenizer
 from pydriller import GitRepository, RepositoryMining

+def message_cleaner(message):
+    msg = message.split("\n")[0]
+    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
+    return msg
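The new message_cleaner is the "parser rule" change from the commit title: it keeps only the first line of a commit message and strips GitHub issue references such as '#123' or '(#123)'. A quick check on a hypothetical message:

    import re

    def message_cleaner(message):
        msg = message.split("\n")[0]                    # first line only
        msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)   # drop '#123' / '(#123)'
        return msg

    print(message_cleaner("Fix clone retry (#123)\n\nlong body ..."))
    # -> 'Fix clone retry ' (the trailing space left by the removed reference)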
-def jobs(repo_paths, args):
-    repo, paths = repo_paths
-    repo_path = os.path.join(args.repos_dir, repo)
+def jobs(repo, args):
+    repo_path = os.path.join(args.repos_dir, repo)
     if os.path.exists(repo_path):
-        gr = GitRepository(repo_path)
-        for path in paths:
-            commits = gr.get_commits_modified_file(path)
         for commit in RepositoryMining(
-            repo_path, only_commits=commits
+            repo_path, only_modifications_with_file_types=['.py']
         ).traverse_commits():
-            message = (commit.msg).split("\n")[0]
+            cleaned_message = message_cleaner(commit.msg)
+            tokenized_message = args.tokenizer.tokenize(cleaned_message)
+            if len(tokenized_message) > args.max_target_length:
+                continue
-            added, deleted = [], []
             for mod in commit.modifications:
-                if mod.new_path == path:
+                if not (mod.old_path and mod.new_path):
+                    continue
+                if os.path.splitext(mod.new_path)[1] != '.py':
+                    continue
+                if not mod.diff_parsed["added"]:
+                    continue
+                if not mod.diff_parsed["deleted"]:
+                    continue
+                added, deleted = [], []
                 for line, code in mod.diff_parsed["added"]:
-                    added += args.tokenizer.tokenize(code)
+                    assert isinstance(added, list)
+                    added.extend(args.tokenizer.tokenize(code))
                 for line, code in mod.diff_parsed["deleted"]:
-                    deleted += args.tokenizer.tokenize(code)
+                    assert isinstance(deleted, list)
+                    deleted.extend(args.tokenizer.tokenize(code))
                 if len(added) + len(deleted) <= args.max_source_length:
                     with jsonlines.open(args.output_file, mode="a") as writer:
                         writer.write(
                             {
                                 "repo": repo,
                                 "path": path,
                                 "sha": commit.hash,
-                                "msg": args.tokenizer.tokenize(message),
+                                "msg": tokenized_message,
                                 "added": added,
                                 "deleted": deleted,
                             }
                         )
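Taken together, the rewritten jobs() walks every commit that touched .py files and keeps only two-sided, size-bounded diffs. The pydriller structure it relies on is worth spelling out: Modification.diff_parsed maps 'added'/'deleted' to (line_number, code) pairs. This sketch mimics one loop iteration with hard-coded stand-ins for the modification and the tokenizer (both hypothetical here):

    import jsonlines

    # stand-in for one pydriller Modification.diff_parsed value
    diff_parsed = {
        "added":   [(14, "import re"), (15, "import json")],
        "deleted": [(14, "import json")],
    }
    tokenize = str.split  # stand-in for args.tokenizer.tokenize

    # mirror the guards above: skip one-sided diffs
    if diff_parsed["added"] and diff_parsed["deleted"]:
        added, deleted = [], []
        for line, code in diff_parsed["added"]:
            added.extend(tokenize(code))
        for line, code in diff_parsed["deleted"]:
            deleted.extend(tokenize(code))
        if len(added) + len(deleted) <= 256:  # the --max_source_length default
            with jsonlines.open("dataset.jsonl", mode="a") as writer:
                writer.write({"repo": "pallets/flask", "path": "app.py",
                              "sha": "abc123", "msg": ["Fix", "imports"],
                              "added": added, "deleted": deleted})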
 def main(args):
-    repos = defaultdict(list)
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    repos = set()
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos[js["repo"]].append(js["path"])
+            repos.add(line.replace('https://github.com/', ''))
     os.makedirs(args.output_dir, exist_ok=True)
-    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+    args.output_file = os.path.join(args.output_dir, 'dataset.jsonl')
     func = partial(jobs, args=args)
     with Pool(processes=args.num_workers) as pool:
         with tqdm(total=len(repos)) as pbar:
-            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))):
                 pbar.update()
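Since jobs() now takes a bare 'owner/name' slug, the pool iterates the set directly instead of dict items. A self-contained sketch of this partial + imap_unordered + tqdm pattern, with a trivial stand-in for jobs():

    from functools import partial
    from multiprocessing.pool import Pool
    from tqdm import tqdm

    def jobs(repo, args):
        return repo  # stand-in for the real per-repository work

    if __name__ == "__main__":
        repos = {"pallets/flask", "psf/requests"}  # hypothetical slugs
        func = partial(jobs, args=None)            # bind the shared args object
        with Pool(processes=4) as pool:
            with tqdm(total=len(repos)) as pbar:
                # imap_unordered yields results as workers finish
                for _ in pool.imap_unordered(func, repos):
                    pbar.update()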
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
-    parser.add_argument("--repos_dir", type=str, required=True,
-                        help="directory that all repositories will be downloaded.",
-    )
-    parser.add_argument("--output_dir", type=str, required=True,
-                        help="The output directory where the preprocessed data will be written.",
-    )
-    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
-                        help="The name of tokenizer",
-    )
-    parser.add_argument("--num_workers", default=4, type=int,
-                        help="number of process",
-    )
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
+    parser.add_argument("--repos_dir", type=str, required=True,
+                        help="directory that all repositories had been downloaded.",)
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="The output directory where the preprocessed data will be written.")
+    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
+                        help="The name of tokenizer",)
+    parser.add_argument("--num_workers", default=4, type=int,
+                        help="number of process")
     parser.add_argument("--max_source_length", default=256, type=int,
                         help="The maximum total source sequence length after tokenization. Sequences longer "
                              "than this will be truncated, sequences shorter will be padded.")
     parser.add_argument("--max_target_length", default=128, type=int,
                         help="The maximum total target sequence length after tokenization. Sequences longer "
                              "than this will be truncated, sequences shorter will be padded.")
     args = parser.parse_args()
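With --jsonl_file gone, the parser stage runs directly off the cloned corpus. A hypothetical invocation matching the arguments defined above (all paths are placeholders):

    python gitparser.py \
        --repositories repositories.txt \
        --repos_dir repos \
        --output_dir output \
        --tokenizer_name microsoft/codebert-base \
        --num_workers 4

Per the main() change above, output lands in output/dataset.jsonl regardless of the input file name.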
repositories.txt 0 → 100644 (new file)
https://github.com/donnemartin/system-design-primer
https://github.com/public-apis/public-apis
https://github.com/TheAlgorithms/Python
https://github.com/vinta/awesome-python
https://github.com/tensorflow/models
https://github.com/nvbn/thefuck
https://github.com/django/django
https://github.com/pallets/flask
https://github.com/httpie/httpie
https://github.com/josephmisiti/awesome-machine-learning
https://github.com/ansible/ansible
https://github.com/psf/requests
https://github.com/scikit-learn/scikit-learn
https://github.com/scrapy/scrapy
https://github.com/minimaxir/big-list-of-naughty-strings
https://github.com/ageitgey/face_recognition
https://github.com/home-assistant/core
https://github.com/soimort/you-get
https://github.com/huggingface/transformers
https://github.com/deepfakes/faceswap
https://github.com/apache/incubator-superset
https://github.com/XX-net/XX-Net
https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
https://github.com/certbot/certbot
https://github.com/pandas-dev/pandas
https://github.com/localstack/localstack
https://github.com/getsentry/sentry
https://github.com/3b1b/manim
https://github.com/faif/python-patterns
https://github.com/google-research/bert
https://github.com/facebookresearch/Detectron
https://github.com/openai/gym
https://github.com/tiangolo/fastapi
https://github.com/ycm-core/YouCompleteMe
https://github.com/0xAX/linux-insides
https://github.com/satwikkansal/wtfpython
https://github.com/pypa/pipenv
https://github.com/CorentinJ/Real-Time-Voice-Cloning
https://github.com/donnemartin/interactive-coding-challenges
https://github.com/docker/compose
https://github.com/iperov/DeepFaceLab
https://github.com/mitmproxy/mitmproxy
https://github.com/donnemartin/data-science-ipython-notebooks
https://github.com/tornadoweb/tornado
https://github.com/chubin/cheat.sh
https://github.com/trailofbits/algo
https://github.com/geekcomputers/Python
https://github.com/encode/django-rest-framework
https://github.com/d2l-ai/d2l-zh
https://github.com/apache/airflow
https://github.com/matterport/Mask_RCNN
https://github.com/swisskyrepo/PayloadsAllTheThings
https://github.com/yunjey/pytorch-tutorial
https://github.com/sqlmapproject/sqlmap
https://github.com/psf/black
https://github.com/eriklindernoren/ML-From-Scratch
https://github.com/keon/algorithms
https://github.com/google/python-fire
https://github.com/explosion/spaCy
https://github.com/drduh/macOS-Security-and-Privacy-Guide
https://github.com/nicolargo/glances
https://github.com/sebastianruder/NLP-progress
https://github.com/StevenBlack/hosts
https://github.com/tqdm/tqdm
https://github.com/celery/celery
https://github.com/magenta/magenta
https://github.com/gto76/python-cheatsheet
https://github.com/reddit-archive/reddit
https://github.com/numpy/numpy
https://github.com/sherlock-project/sherlock
https://github.com/instillai/TensorFlow-Course
https://github.com/charlax/professional-programming
https://github.com/binux/pyspider
https://github.com/ipython/ipython
https://github.com/deezer/spleeter
https://github.com/pytorch/examples
https://github.com/toml-lang/toml
https://github.com/luong-komorebi/Awesome-Linux-Software
https://github.com/bokeh/bokeh
https://github.com/bitcoinbook/bitcoinbook
https://github.com/locustio/locust
https://github.com/nginx-proxy/nginx-proxy
https://github.com/microsoft/cascadia-code
https://github.com/OWASP/CheatSheetSeries
https://github.com/spotify/luigi
https://github.com/cool-RR/PySnooper
https://github.com/ray-project/ray
https://github.com/openai/gpt-2
https://github.com/willmcgugan/rich
https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
https://github.com/facebookresearch/detectron2
https://github.com/plotly/dash
https://github.com/PaddlePaddle/Paddle
https://github.com/cookiecutter/cookiecutter
https://github.com/chubin/wttr.in
https://github.com/zulip/zulip
https://github.com/python-poetry/poetry
https://github.com/fabric/fabric
https://github.com/matplotlib/matplotlib
https://github.com/tzutalin/labelImg
\ No newline at end of file