graykode

(fix) git clone in top 100 python repositories and change parser rule

@@ -14,10 +14,8 @@
 import os
 import git
-import json
 import argparse
 from git import Repo
-from tqdm import tqdm
 from time import sleep
 from queue import Queue
 from threading import Thread
@@ -55,7 +53,7 @@ class ClonePooler(object):
             )
             sleep(0.1)
             self.count += 1
-            print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
+            print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}")
         except git.exc.InvalidGitRepositoryError:
             print(f'{repo} is not found.')
         except git.exc.GitError as e:
@@ -65,11 +63,10 @@ def main(args):
     os.makedirs(args.repos_dir, exist_ok=True)
     repos = set()
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos.add(js['repo'])
+            repos.add(line.replace('https://github.com/', ''))
     pooler = ClonePooler(
         total_repos=len(repos)
@@ -80,8 +77,8 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
     parser.add_argument("--repos_dir", type=str, required=True,
                         help="directory that all repositories will be downloaded.")
    parser.add_argument("--num_worker_threads", type=int, default=16,
......
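That is the whole change to the clone script: instead of parsing a jsonl file, it now reads a plain-text file with one GitHub URL per line and strips the https://github.com/ prefix to get owner/name pairs. A minimal, self-contained sketch of this flow, assuming a repositories.txt laid out like the list at the end of this commit (file and directory names, and rebuilding the clone URL from the owner/name pair, are illustrative assumptions, not taken from the diff; Repo.clone_from is the standard GitPython call):

import os
from git import Repo  # GitPython, same import as in the script above

def clone_all(repositories_file, repos_dir):
    # Collect "owner/name" pairs the same way the new main() does:
    # one URL per line, with the https://github.com/ prefix stripped.
    repos = set()
    with open(repositories_file, encoding="utf-8") as f:
        for line in f:
            repos.add(line.strip().replace('https://github.com/', ''))

    os.makedirs(repos_dir, exist_ok=True)
    for repo in sorted(repos):
        target = os.path.join(repos_dir, repo)
        if not os.path.exists(target):
            # The pooler's worker wraps this same call in the
            # try/except blocks for git.exc errors shown in the diff.
            Repo.clone_from(f'https://github.com/{repo}', target)

if __name__ == '__main__':
    clone_all('repositories.txt', 'repos')  # illustrative paths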
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
+import re
 import json
 import jsonlines
 import argparse
@@ -23,91 +24,85 @@ from multiprocessing.pool import Pool
 from transformers import RobertaTokenizer
 from pydriller import GitRepository, RepositoryMining

+def message_cleaner(message):
+    msg = message.split("\n")[0]
+    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
+    return msg

-def jobs(repo_paths, args):
-    repo, paths = repo_paths
-    repo_path = os.path.join(args.repos_dir, repo)
+def jobs(repo, args):
+    repo_path = os.path.join(args.repos_dir, repo)
     if os.path.exists(repo_path):
         gr = GitRepository(repo_path)
-        for path in paths:
-            commits = gr.get_commits_modified_file(path)
-            for commit in RepositoryMining(
-                repo_path, only_commits=commits
-            ).traverse_commits():
-                message = (commit.msg).split("\n")[0]
+        for commit in RepositoryMining(
+            repo_path, only_modifications_with_file_types=['.py']
+        ).traverse_commits():
+            cleaned_message = message_cleaner(commit.msg)
+            tokenized_message = args.tokenizer.tokenize(cleaned_message)
+            if len(tokenized_message) > args.max_target_length:
+                continue
+            for mod in commit.modifications:
+                if not (mod.old_path and mod.new_path):
+                    continue
+                if os.path.splitext(mod.new_path)[1] != '.py':
+                    continue
+                if not mod.diff_parsed["added"]:
+                    continue
+                if not mod.diff_parsed["deleted"]:
+                    continue
                 added, deleted = [], []
-                for mod in commit.modifications:
-                    if mod.new_path == path:
-                        for line, code in mod.diff_parsed["added"]:
-                            added += args.tokenizer.tokenize(code)
-                        assert isinstance(added, list)
-                        for line, code in mod.diff_parsed["deleted"]:
-                            deleted += args.tokenizer.tokenize(code)
-                        assert isinstance(deleted, list)
-                with jsonlines.open(args.output_file, mode="a") as writer:
-                    writer.write(
-                        {
-                            "repo": repo,
-                            "path": path,
-                            "sha": commit.hash,
-                            "msg": args.tokenizer.tokenize(message),
-                            "added": added,
-                            "deleted": deleted,
-                        }
-                    )
+                for line, code in mod.diff_parsed["added"]:
+                    added.extend(args.tokenizer.tokenize(code))
+                for line, code in mod.diff_parsed["deleted"]:
+                    deleted.extend(args.tokenizer.tokenize(code))
+                if len(added) + len(deleted) <= args.max_source_length:
+                    with jsonlines.open(args.output_file, mode="a") as writer:
+                        writer.write(
+                            {
+                                "msg": tokenized_message,
+                                "added": added,
+                                "deleted": deleted,
+                            }
+                        )
 def main(args):
-    repos = defaultdict(list)
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    repos = set()
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos[js["repo"]].append(js["path"])
+            repos.add(line.replace('https://github.com/', ''))
     os.makedirs(args.output_dir, exist_ok=True)
-    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+    args.output_file = os.path.join(args.output_dir, 'dataset.jsonl')
     func = partial(jobs, args=args)
     with Pool(processes=args.num_workers) as pool:
        with tqdm(total=len(repos)) as pbar:
-            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))):
                pbar.update()
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--jsonl_file", type=str, required=True, help="jsonl file path."
-    )
-    parser.add_argument(
-        "--repos_dir",
-        type=str,
-        required=True,
-        help="directory that all repositories will be downloaded.",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The output directory where the preprocessed data will be written.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default="microsoft/codebert-base",
-        help="The name of tokenizer",
-    )
-    parser.add_argument(
-        "--num_workers",
-        default=4,
-        type=int,
-        help="number of process",
-    )
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
+    parser.add_argument("--repos_dir", type=str, required=True,
+                        help="directory that all repositories had been downloaded.",)
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="The output directory where the preprocessed data will be written.")
+    parser.add_argument("--tokenizer_name", type=str,
+                        default="microsoft/codebert-base", help="The name of tokenizer",)
+    parser.add_argument("--num_workers", default=4, type=int, help="number of process")
+    parser.add_argument("--max_source_length", default=256, type=int,
+                        help="The maximum total source sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_target_length", default=128, type=int,
+                        help="The maximum total target sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
     args = parser.parse_args()
......
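The new parser rule is easiest to see on a concrete commit message. message_cleaner keeps only the first line of the message and strips issue references such as #123 or (#123); a commit is then skipped entirely when its tokenized message exceeds --max_target_length, and a modification is written out only if it has both an old and a new path, is a .py file, has non-empty added and deleted hunks, and fits within --max_source_length tokens. A self-contained check of the cleaning step, with the function copied verbatim from the diff above and an illustrative input:

import re

def message_cleaner(message):
    # Same rule as in the parser: first line only, issue refs removed.
    msg = message.split("\n")[0]
    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
    return msg

# "(#123)" is removed and everything after the first newline is dropped.
print(message_cleaner("Fix broken link (#123)\n\nlonger description"))
# -> "Fix broken link "  (the regex leaves a trailing space behind)

The --repositories input consumed by both scripts, shown in full below, is simply one GitHub URL per line for the top 100 Python repositories.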
https://github.com/donnemartin/system-design-primer
https://github.com/public-apis/public-apis
https://github.com/TheAlgorithms/Python
https://github.com/vinta/awesome-python
https://github.com/tensorflow/models
https://github.com/nvbn/thefuck
https://github.com/django/django
https://github.com/pallets/flask
https://github.com/httpie/httpie
https://github.com/josephmisiti/awesome-machine-learning
https://github.com/ansible/ansible
https://github.com/psf/requests
https://github.com/scikit-learn/scikit-learn
https://github.com/scrapy/scrapy
https://github.com/minimaxir/big-list-of-naughty-strings
https://github.com/ageitgey/face_recognition
https://github.com/home-assistant/core
https://github.com/soimort/you-get
https://github.com/huggingface/transformers
https://github.com/deepfakes/faceswap
https://github.com/apache/incubator-superset
https://github.com/XX-net/XX-Net
https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
https://github.com/certbot/certbot
https://github.com/pandas-dev/pandas
https://github.com/localstack/localstack
https://github.com/getsentry/sentry
https://github.com/3b1b/manim
https://github.com/faif/python-patterns
https://github.com/google-research/bert
https://github.com/facebookresearch/Detectron
https://github.com/openai/gym
https://github.com/tiangolo/fastapi
https://github.com/ycm-core/YouCompleteMe
https://github.com/0xAX/linux-insides
https://github.com/satwikkansal/wtfpython
https://github.com/pypa/pipenv
https://github.com/CorentinJ/Real-Time-Voice-Cloning
https://github.com/donnemartin/interactive-coding-challenges
https://github.com/docker/compose
https://github.com/iperov/DeepFaceLab
https://github.com/mitmproxy/mitmproxy
https://github.com/donnemartin/data-science-ipython-notebooks
https://github.com/tornadoweb/tornado
https://github.com/chubin/cheat.sh
https://github.com/trailofbits/algo
https://github.com/geekcomputers/Python
https://github.com/encode/django-rest-framework
https://github.com/d2l-ai/d2l-zh
https://github.com/apache/airflow
https://github.com/matterport/Mask_RCNN
https://github.com/swisskyrepo/PayloadsAllTheThings
https://github.com/yunjey/pytorch-tutorial
https://github.com/sqlmapproject/sqlmap
https://github.com/psf/black
https://github.com/eriklindernoren/ML-From-Scratch
https://github.com/keon/algorithms
https://github.com/google/python-fire
https://github.com/explosion/spaCy
https://github.com/drduh/macOS-Security-and-Privacy-Guide
https://github.com/nicolargo/glances
https://github.com/sebastianruder/NLP-progress
https://github.com/StevenBlack/hosts
https://github.com/tqdm/tqdm
https://github.com/celery/celery
https://github.com/magenta/magenta
https://github.com/gto76/python-cheatsheet
https://github.com/reddit-archive/reddit
https://github.com/numpy/numpy
https://github.com/sherlock-project/sherlock
https://github.com/instillai/TensorFlow-Course
https://github.com/charlax/professional-programming
https://github.com/binux/pyspider
https://github.com/ipython/ipython
https://github.com/deezer/spleeter
https://github.com/pytorch/examples
https://github.com/toml-lang/toml
https://github.com/luong-komorebi/Awesome-Linux-Software
https://github.com/bokeh/bokeh
https://github.com/bitcoinbook/bitcoinbook
https://github.com/locustio/locust
https://github.com/nginx-proxy/nginx-proxy
https://github.com/microsoft/cascadia-code
https://github.com/OWASP/CheatSheetSeries
https://github.com/spotify/luigi
https://github.com/cool-RR/PySnooper
https://github.com/ray-project/ray
https://github.com/openai/gpt-2
https://github.com/willmcgugan/rich
https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
https://github.com/facebookresearch/detectron2
https://github.com/plotly/dash
https://github.com/PaddlePaddle/Paddle
https://github.com/cookiecutter/cookiecutter
https://github.com/chubin/wttr.in
https://github.com/zulip/zulip
https://github.com/python-poetry/poetry
https://github.com/fabric/fabric
https://github.com/matplotlib/matplotlib
https://github.com/tzutalin/labelImg
\ No newline at end of file