graykode

(fix) git clone for top 100 Python repositories and change parser rule

@@ -14,10 +14,8 @@
 
 import os
 import git
-import json
 import argparse
 from git import Repo
-from tqdm import tqdm
 from time import sleep
 from queue import Queue
 from threading import Thread
@@ -55,7 +53,7 @@ class ClonePooler(object):
             )
             sleep(0.1)
             self.count += 1
-            print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
+            print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}")
         except git.exc.InvalidGitRepositoryError:
             print(f'{repo} is not found.')
         except git.exc.GitError as e:
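The only behavioural change in this hunk is the progress line: the old f-string printed the percentage at full float precision, while `format(..., '.2f')` pins it to two decimals. A minimal sketch with assumed counter values:

```python
count, total_repos = 1, 3  # assumed values for illustration

# Old: full float precision, e.g. "1/3 33.33333333333333"
print(f"{count}/{total_repos} {(count/total_repos) * 100}")

# New: fixed to two decimals, e.g. "1/3 33.33"
print(f"{count}/{total_repos} {format((count/total_repos) * 100, '.2f')}")
```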
@@ -65,11 +63,10 @@ def main(args):
 
     os.makedirs(args.repos_dir, exist_ok=True)
     repos = set()
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos.add(js['repo'])
+            repos.add(line.replace('https://github.com/', ''))
 
     pooler = ClonePooler(
         total_repos=len(repos)
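This is the parser-rule change from the commit title: the input is now a plain list of GitHub URLs rather than JSONL records, and the `owner/name` slug is recovered by stripping the host prefix. For example:

```python
# A line as it appears in the new repositories file added below.
line = "https://github.com/huggingface/transformers\n"

# New rule: strip whitespace, then drop the host prefix.
repo = line.strip().replace('https://github.com/', '')
assert repo == "huggingface/transformers"
```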
@@ -80,8 +77,8 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
     parser.add_argument("--repos_dir", type=str, required=True,
                         help="directory that all repositories will be downloaded.")
     parser.add_argument("--num_worker_threads", type=int, default=16,
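The renamed `--repositories` flag feeds that URL list straight into the clone pool. The clone call itself sits outside this diff; a hedged, self-contained sketch of what the GitPython side presumably looks like (paths and slug are assumptions):

```python
import os
from git import Repo

repos_dir = "repos"      # assumed value of --repos_dir
repo = "pallets/flask"   # slug parsed from the URL list

# Re-expand the slug into a clone URL; the exact call inside
# ClonePooler's worker is not shown in this diff.
Repo.clone_from(f"https://github.com/{repo}", os.path.join(repos_dir, repo))
```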
...
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import re
 import json
 import jsonlines
 import argparse
@@ -23,91 +24,85 @@ from multiprocessing.pool import Pool
 from transformers import RobertaTokenizer
 from pydriller import GitRepository, RepositoryMining
 
+def message_cleaner(message):
+    msg = message.split("\n")[0]
+    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
+    return msg
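The new `message_cleaner` keeps only the first line of a commit message and strips issue/PR references such as `#123` or `(#123)`. A quick check of the regex on assumed sample messages:

```python
import re

def message_cleaner(message):
    # First line only, then drop references like "#123" or "(#123)".
    msg = message.split("\n")[0]
    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
    return msg

# Assumed sample messages, not taken from any dataset.
print(message_cleaner("Fix crash on empty input (#123)\n\nlong body"))
# -> "Fix crash on empty input "
print(message_cleaner("Merge pull request #45 from user/branch"))
# -> "Merge pull request  from user/branch"
```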
 
-def jobs(repo_paths, args):
-    repo, paths = repo_paths
-    repo_path = os.path.join(args.repos_dir, repo)
 
+def jobs(repo, args):
+    repo_path = os.path.join(args.repos_dir, repo)
     if os.path.exists(repo_path):
-        gr = GitRepository(repo_path)
-
-        for path in paths:
-            commits = gr.get_commits_modified_file(path)
-            for commit in RepositoryMining(
-                repo_path, only_commits=commits
-            ).traverse_commits():
-                message = (commit.msg).split("\n")[0]
+        for commit in RepositoryMining(
+            repo_path, only_modifications_with_file_types=['.py']
+        ).traverse_commits():
+            cleaned_message = message_cleaner(commit.msg)
+            tokenized_message = args.tokenizer.tokenize(cleaned_message)
+            if len(tokenized_message) > args.max_target_length:
+                continue
+
+            for mod in commit.modifications:
+                if not (mod.old_path and mod.new_path):
+                    continue
+                if os.path.splitext(mod.new_path)[1] != '.py':
+                    continue
+                if not mod.diff_parsed["added"]:
+                    continue
+                if not mod.diff_parsed["deleted"]:
+                    continue
 
                 added, deleted = [], []
-                for mod in commit.modifications:
-                    if mod.new_path == path:
-                        for line, code in mod.diff_parsed["added"]:
-                            added += args.tokenizer.tokenize(code)
-                        assert isinstance(added, list)
-
-                        for line, code in mod.diff_parsed["deleted"]:
-                            deleted += args.tokenizer.tokenize(code)
-                        assert isinstance(deleted, list)
-
-                with jsonlines.open(args.output_file, mode="a") as writer:
-                    writer.write(
-                        {
-                            "repo": repo,
-                            "path": path,
-                            "sha": commit.hash,
-                            "msg": args.tokenizer.tokenize(message),
-                            "added": added,
-                            "deleted": deleted,
-                        }
-                    )
+
+                for line, code in mod.diff_parsed["added"]:
+                    added.extend(args.tokenizer.tokenize(code))
+
+                for line, code in mod.diff_parsed["deleted"]:
+                    deleted.extend(args.tokenizer.tokenize(code))
+
+                if len(added) + len(deleted) <= args.max_source_length:
+                    with jsonlines.open(args.output_file, mode="a") as writer:
+                        writer.write(
+                            {
+                                "msg": tokenized_message,
+                                "added": added,
+                                "deleted": deleted,
+                            }
+                        )
 
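The rewritten `jobs` replaces the per-path `get_commits_modified_file` lookups with a single pass per repository, letting pydriller's `only_modifications_with_file_types` filter pre-select commits that touch `.py` files; a commit survives only if its tokenized message and diff fit the length budgets. A minimal standalone sketch of that traversal (the local clone path is an assumption):

```python
from pydriller import RepositoryMining

# Assumed local clone of one repository from the list below.
for commit in RepositoryMining(
    "repos/pallets/flask", only_modifications_with_file_types=['.py']
).traverse_commits():
    for mod in commit.modifications:
        # diff_parsed maps "added"/"deleted" to (line_number, code) pairs.
        n_added = len(mod.diff_parsed["added"])
        n_deleted = len(mod.diff_parsed["deleted"])
        print(commit.msg.split("\n")[0], mod.new_path, n_added, n_deleted)
    break  # sketch: inspect the first matching commit only
```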
 def main(args):
-    repos = defaultdict(list)
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    repos = set()
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos[js["repo"]].append(js["path"])
+            repos.add(line.replace('https://github.com/', ''))
 
     os.makedirs(args.output_dir, exist_ok=True)
-    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+    args.output_file = os.path.join(args.output_dir, 'dataset.jsonl')
 
     func = partial(jobs, args=args)
     with Pool(processes=args.num_workers) as pool:
         with tqdm(total=len(repos)) as pbar:
-            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))):
                 pbar.update()
 
 
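`main` now feeds bare `owner/name` slugs into the pool instead of `(repo, paths)` items, with `partial` pinning the shared `args` so each worker runs `jobs(repo, args=args)`. A reduced, self-contained sketch of the same `Pool`/`imap_unordered`/`tqdm` pattern with a stand-in worker:

```python
from functools import partial
from multiprocessing.pool import Pool
from tqdm import tqdm

def jobs(repo, args):
    # Stand-in for the real worker above; just echoes its input.
    return repo

if __name__ == "__main__":
    repos = {"pallets/flask", "psf/requests"}  # assumed slugs
    func = partial(jobs, args=None)
    with Pool(processes=4) as pool:
        with tqdm(total=len(repos)) as pbar:
            for _ in pool.imap_unordered(func, repos):
                pbar.update()
```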
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--jsonl_file", type=str, required=True, help="jsonl file path."
-    )
-    parser.add_argument(
-        "--repos_dir",
-        type=str,
-        required=True,
-        help="directory that all repositories will be downloaded.",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The output directory where the preprocessed data will be written.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default="microsoft/codebert-base",
-        help="The name of tokenizer",
-    )
-    parser.add_argument(
-        "--num_workers",
-        default=4,
-        type=int,
-        help="number of process",
-    )
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
+    parser.add_argument("--repos_dir", type=str, required=True,
+                        help="directory that all repositories had been downloaded.",)
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="The output directory where the preprocessed data will be written.")
+    parser.add_argument("--tokenizer_name", type=str,
+                        default="microsoft/codebert-base", help="The name of tokenizer",)
+    parser.add_argument("--num_workers", default=4, type=int, help="number of process")
+    parser.add_argument("--max_source_length", default=256, type=int,
+                        help="The maximum total source sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_target_length", default=128, type=int,
+                        help="The maximum total target sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
 
     args = parser.parse_args()
 
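One seam worth noting: `jobs` reads `args.tokenizer`, while the parser only defines `--tokenizer_name`. The bridging line falls outside this diff, but given the `RobertaTokenizer` import it presumably resembles the following (hedged reconstruction, not part of the commit):

```python
# Assumed glue after args = parser.parse_args(); not shown in this diff.
args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
```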
...
+https://github.com/donnemartin/system-design-primer
+https://github.com/public-apis/public-apis
+https://github.com/TheAlgorithms/Python
+https://github.com/vinta/awesome-python
+https://github.com/tensorflow/models
+https://github.com/nvbn/thefuck
+https://github.com/django/django
+https://github.com/pallets/flask
+https://github.com/httpie/httpie
+https://github.com/josephmisiti/awesome-machine-learning
+https://github.com/ansible/ansible
+https://github.com/psf/requests
+https://github.com/scikit-learn/scikit-learn
+https://github.com/scrapy/scrapy
+https://github.com/minimaxir/big-list-of-naughty-strings
+https://github.com/ageitgey/face_recognition
+https://github.com/home-assistant/core
+https://github.com/soimort/you-get
+https://github.com/huggingface/transformers
+https://github.com/deepfakes/faceswap
+https://github.com/apache/incubator-superset
+https://github.com/XX-net/XX-Net
+https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
+https://github.com/certbot/certbot
+https://github.com/pandas-dev/pandas
+https://github.com/localstack/localstack
+https://github.com/getsentry/sentry
+https://github.com/3b1b/manim
+https://github.com/faif/python-patterns
+https://github.com/google-research/bert
+https://github.com/facebookresearch/Detectron
+https://github.com/openai/gym
+https://github.com/tiangolo/fastapi
+https://github.com/ycm-core/YouCompleteMe
+https://github.com/0xAX/linux-insides
+https://github.com/satwikkansal/wtfpython
+https://github.com/pypa/pipenv
+https://github.com/CorentinJ/Real-Time-Voice-Cloning
+https://github.com/donnemartin/interactive-coding-challenges
+https://github.com/docker/compose
+https://github.com/iperov/DeepFaceLab
+https://github.com/mitmproxy/mitmproxy
+https://github.com/donnemartin/data-science-ipython-notebooks
+https://github.com/tornadoweb/tornado
+https://github.com/chubin/cheat.sh
+https://github.com/trailofbits/algo
+https://github.com/geekcomputers/Python
+https://github.com/encode/django-rest-framework
+https://github.com/d2l-ai/d2l-zh
+https://github.com/apache/airflow
+https://github.com/matterport/Mask_RCNN
+https://github.com/swisskyrepo/PayloadsAllTheThings
+https://github.com/yunjey/pytorch-tutorial
+https://github.com/sqlmapproject/sqlmap
+https://github.com/psf/black
+https://github.com/eriklindernoren/ML-From-Scratch
+https://github.com/keon/algorithms
+https://github.com/google/python-fire
+https://github.com/explosion/spaCy
+https://github.com/drduh/macOS-Security-and-Privacy-Guide
+https://github.com/nicolargo/glances
+https://github.com/sebastianruder/NLP-progress
+https://github.com/StevenBlack/hosts
+https://github.com/tqdm/tqdm
+https://github.com/celery/celery
+https://github.com/magenta/magenta
+https://github.com/gto76/python-cheatsheet
+https://github.com/reddit-archive/reddit
+https://github.com/numpy/numpy
+https://github.com/sherlock-project/sherlock
+https://github.com/instillai/TensorFlow-Course
+https://github.com/charlax/professional-programming
+https://github.com/binux/pyspider
+https://github.com/ipython/ipython
+https://github.com/deezer/spleeter
+https://github.com/pytorch/examples
+https://github.com/toml-lang/toml
+https://github.com/luong-komorebi/Awesome-Linux-Software
+https://github.com/bokeh/bokeh
+https://github.com/bitcoinbook/bitcoinbook
+https://github.com/locustio/locust
+https://github.com/nginx-proxy/nginx-proxy
+https://github.com/microsoft/cascadia-code
+https://github.com/OWASP/CheatSheetSeries
+https://github.com/spotify/luigi
+https://github.com/cool-RR/PySnooper
+https://github.com/ray-project/ray
+https://github.com/openai/gpt-2
+https://github.com/willmcgugan/rich
+https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
+https://github.com/facebookresearch/detectron2
+https://github.com/plotly/dash
+https://github.com/PaddlePaddle/Paddle
+https://github.com/cookiecutter/cookiecutter
+https://github.com/chubin/wttr.in
+https://github.com/zulip/zulip
+https://github.com/python-poetry/poetry
+https://github.com/fabric/fabric
+https://github.com/matplotlib/matplotlib
+https://github.com/tzutalin/labelImg
\ No newline at end of file