graykode / commit-autosuggestions
Authored by graykode on 2020-10-30 21:03:31 +0900
Commit 4c89b4b6bbe7f003572d36c780d03d46336aefc5 (1 parent: 28ef1238)
(fix) git clone in top 100 python repositories and change parser rule
Showing 3 changed files (gitcloner.py, gitparser.py, repositories.txt) with 152 additions and 60 deletions.
gitcloner.py
@@ -14,10 +14,8 @@
 import os
 import git
-import json
 import argparse
 from git import Repo
 from tqdm import tqdm
 from time import sleep
 from queue import Queue
 from threading import Thread
@@ -55,7 +53,7 @@ class ClonePooler(object):
             )
             sleep(0.1)
             self.count += 1
-            print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
+            print(f"{self.count}/{self.total_repos} {format((self.count/self.total_repos) * 100, '.2f')}")
         except git.exc.InvalidGitRepositoryError:
             print(f'{repo} is not found.')
         except git.exc.GitError as e:
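Note: the only change in this hunk is the progress line, which now pins the percentage to two decimal places. A minimal standalone sketch of the before/after behaviour, with hypothetical counts:

    count, total_repos = 37, 300
    # before: prints the raw float with full precision
    print(f"{count}/{total_repos} {(count / total_repos) * 100}")
    # after: format(..., '.2f') trims it, printing '37/300 12.33'
    print(f"{count}/{total_repos} {format((count / total_repos) * 100, '.2f')}")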
@@ -65,11 +63,10 @@ def main(args):
     os.makedirs(args.repos_dir, exist_ok=True)
     repos = set()
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos.add(js['repo'])
+            repos.add(line.replace('https://github.com/', ''))
     pooler = ClonePooler(
         total_repos=len(repos)
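This hunk is the core of the clone fix: the repository list now comes from a plain text file of GitHub URLs (repositories.txt, added below) rather than from the jsonl corpus, presumably so the full top-100 list is cloned up front. A sketch of the slug extraction on a hypothetical two-line input:

    # Hypothetical input mirroring repositories.txt: one GitHub URL per line.
    lines = ["https://github.com/pallets/flask\n",
             "https://github.com/psf/requests\n"]

    repos = set()
    for line in lines:
        # strip whitespace, then drop the host prefix to get an 'owner/name' slug
        repos.add(line.strip().replace('https://github.com/', ''))

    print(repos)  # {'pallets/flask', 'psf/requests'}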
@@ -80,8 +77,8 @@ def main(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
     parser.add_argument("--repos_dir", type=str, required=True,
                         help="directory that all repositories will be downloaded.")
     parser.add_argument("--num_worker_threads", type=int, default=16,
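For context, the Queue/Thread imports at the top of gitcloner.py point at a worker-pool design for the cloning itself. ClonePooler's internals are outside this diff, so the following is only a hedged sketch of that pattern with GitPython; the function name, queue wiring, and paths here are assumptions, not the file's real code:

    import os
    from queue import Queue
    from threading import Thread
    from git import Repo  # GitPython

    def clone_worker(q, repos_dir):
        # each worker pulls 'owner/name' slugs off the shared queue
        while True:
            repo = q.get()
            try:
                Repo.clone_from(f"https://github.com/{repo}",
                                os.path.join(repos_dir, repo))
            except Exception as e:  # the real code catches git.exc errors specifically
                print(f"{repo}: {e}")
            finally:
                q.task_done()

    q = Queue()
    for _ in range(16):  # matches the --num_worker_threads default above
        Thread(target=clone_worker, args=(q, "repos"), daemon=True).start()
    q.put("pallets/flask")  # hypothetical slug from repositories.txt
    q.join()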
gitparser.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
+import re
 import json
 import jsonlines
 import argparse
@@ -23,91 +24,85 @@ from multiprocessing.pool import Pool
 from transformers import RobertaTokenizer
 from pydriller import GitRepository, RepositoryMining

+def message_cleaner(message):
+    msg = message.split("\n")[0]
+    msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
+    return msg
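The new message_cleaner is the "parser rule" change from the commit title: it keeps only the first line of a commit message and strips GitHub issue references such as '#123' or '(#123)'. A quick check on a hypothetical message:

    import re

    def message_cleaner(message):
        msg = message.split("\n")[0]                    # first line only
        msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)   # drop '#123' / '(#123)'
        return msg

    print(message_cleaner("Fix clone retry (#123)\n\nlong body ..."))
    # -> 'Fix clone retry ' (the trailing space left by the removed reference)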
-def jobs(repo_paths, args):
-    repo, paths = repo_paths
-    repo_path = os.path.join(args.repos_dir, repo)
+def jobs(repo, args):
+    repo_path = os.path.join(args.repos_dir, repo)
     if os.path.exists(repo_path):
-        gr = GitRepository(repo_path)
-        for path in paths:
-            commits = gr.get_commits_modified_file(path)
         for commit in RepositoryMining(
-            repo_path, only_commits=commits
+            repo_path, only_modifications_with_file_types=['.py']
         ).traverse_commits():
-            message = (commit.msg).split("\n")[0]
+            cleaned_message = message_cleaner(commit.msg)
+            tokenized_message = args.tokenizer.tokenize(cleaned_message)
+            if len(tokenized_message) > args.max_target_length:
+                continue
-            added, deleted = [], []
             for mod in commit.modifications:
-                if mod.new_path == path:
+                if not (mod.old_path and mod.new_path):
+                    continue
+                if os.path.splitext(mod.new_path)[1] != '.py':
+                    continue
+                if not mod.diff_parsed["added"]:
+                    continue
+                if not mod.diff_parsed["deleted"]:
+                    continue
+                added, deleted = [], []
                 for line, code in mod.diff_parsed["added"]:
-                    added += args.tokenizer.tokenize(code)
+                    assert isinstance(added, list)
+                    added.extend(args.tokenizer.tokenize(code))
                 for line, code in mod.diff_parsed["deleted"]:
-                    deleted += args.tokenizer.tokenize(code)
+                    assert isinstance(deleted, list)
+                    deleted.extend(args.tokenizer.tokenize(code))
                 if len(added) + len(deleted) <= args.max_source_length:
                     with jsonlines.open(args.output_file, mode="a") as writer:
                         writer.write(
                             {
                                 "repo": repo,
                                 "path": path,
                                 "sha": commit.hash,
-                                "msg": args.tokenizer.tokenize(message),
+                                "msg": tokenized_message,
                                 "added": added,
                                 "deleted": deleted,
                             }
                         )
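Taken together, the rewritten jobs() walks every commit that touched .py files and keeps only two-sided, size-bounded diffs. The pydriller structure it relies on is worth spelling out: Modification.diff_parsed maps 'added'/'deleted' to (line_number, code) pairs. This sketch mimics one loop iteration with hard-coded stand-ins for the modification and the tokenizer (both hypothetical here):

    import jsonlines

    # stand-in for one pydriller Modification.diff_parsed value
    diff_parsed = {
        "added":   [(14, "import re"), (15, "import json")],
        "deleted": [(14, "import json")],
    }
    tokenize = str.split  # stand-in for args.tokenizer.tokenize

    # mirror the guards above: skip one-sided diffs
    if diff_parsed["added"] and diff_parsed["deleted"]:
        added, deleted = [], []
        for line, code in diff_parsed["added"]:
            added.extend(tokenize(code))
        for line, code in diff_parsed["deleted"]:
            deleted.extend(tokenize(code))
        if len(added) + len(deleted) <= 256:  # the --max_source_length default
            with jsonlines.open("dataset.jsonl", mode="a") as writer:
                writer.write({"repo": "pallets/flask", "path": "app.py",
                              "sha": "abc123", "msg": ["Fix", "imports"],
                              "added": added, "deleted": deleted})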
 def main(args):
-    repos = defaultdict(list)
-    with open(args.jsonl_file, encoding="utf-8") as f:
+    repos = set()
+    with open(args.repositories, encoding="utf-8") as f:
         for idx, line in enumerate(f):
             line = line.strip()
-            js = json.loads(line)
-            repos[js["repo"]].append(js["path"])
+            repos.add(line.replace('https://github.com/', ''))
     os.makedirs(args.output_dir, exist_ok=True)
-    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+    args.output_file = os.path.join(args.output_dir, 'dataset.jsonl')
     func = partial(jobs, args=args)
     with Pool(processes=args.num_workers) as pool:
         with tqdm(total=len(repos)) as pbar:
-            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+            for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos))):
                 pbar.update()
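Since jobs() now takes a bare 'owner/name' slug, the pool iterates the set directly instead of dict items. A self-contained sketch of this partial + imap_unordered + tqdm pattern, with a trivial stand-in for jobs():

    from functools import partial
    from multiprocessing.pool import Pool
    from tqdm import tqdm

    def jobs(repo, args):
        return repo  # stand-in for the real per-repository work

    if __name__ == "__main__":
        repos = {"pallets/flask", "psf/requests"}  # hypothetical slugs
        func = partial(jobs, args=None)            # bind the shared args object
        with Pool(processes=4) as pool:
            with tqdm(total=len(repos)) as pbar:
                # imap_unordered yields results as workers finish
                for _ in pool.imap_unordered(func, repos):
                    pbar.update()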
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--jsonl_file", type=str, required=True,
-                        help="jsonl file path.")
-    parser.add_argument("--repos_dir", type=str, required=True,
-                        help="directory that all repositories will be downloaded.",
-    )
-    parser.add_argument("--output_dir", type=str, required=True,
-                        help="The output directory where the preprocessed data will be written.",
-    )
-    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
-                        help="The name of tokenizer",
-    )
-    parser.add_argument("--num_workers", default=4, type=int,
-                        help="number of process",
-    )
+    parser.add_argument("--repositories", type=str, required=True,
+                        help="repositories file path.")
+    parser.add_argument("--repos_dir", type=str, required=True,
+                        help="directory that all repositories had been downloaded.",)
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="The output directory where the preprocessed data will be written.")
+    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
+                        help="The name of tokenizer",)
+    parser.add_argument("--num_workers", default=4, type=int,
+                        help="number of process")
     parser.add_argument("--max_source_length", default=256, type=int,
                         help="The maximum total source sequence length after tokenization. Sequences longer "
                              "than this will be truncated, sequences shorter will be padded.")
     parser.add_argument("--max_target_length", default=128, type=int,
                         help="The maximum total target sequence length after tokenization. Sequences longer "
                              "than this will be truncated, sequences shorter will be padded.")
     args = parser.parse_args()
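With --jsonl_file gone, the parser stage runs directly off the cloned corpus. A hypothetical invocation matching the arguments defined above (all paths are placeholders):

    python gitparser.py \
        --repositories repositories.txt \
        --repos_dir repos \
        --output_dir output \
        --tokenizer_name microsoft/codebert-base \
        --num_workers 4

Per the main() change above, output lands in output/dataset.jsonl regardless of the input file name.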
repositories.txt 0 → 100644 (new file)
https://github.com/donnemartin/system-design-primer
https://github.com/public-apis/public-apis
https://github.com/TheAlgorithms/Python
https://github.com/vinta/awesome-python
https://github.com/tensorflow/models
https://github.com/nvbn/thefuck
https://github.com/django/django
https://github.com/pallets/flask
https://github.com/httpie/httpie
https://github.com/josephmisiti/awesome-machine-learning
https://github.com/ansible/ansible
https://github.com/psf/requests
https://github.com/scikit-learn/scikit-learn
https://github.com/scrapy/scrapy
https://github.com/minimaxir/big-list-of-naughty-strings
https://github.com/ageitgey/face_recognition
https://github.com/home-assistant/core
https://github.com/soimort/you-get
https://github.com/huggingface/transformers
https://github.com/deepfakes/faceswap
https://github.com/apache/incubator-superset
https://github.com/XX-net/XX-Net
https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
https://github.com/certbot/certbot
https://github.com/pandas-dev/pandas
https://github.com/localstack/localstack
https://github.com/getsentry/sentry
https://github.com/3b1b/manim
https://github.com/faif/python-patterns
https://github.com/google-research/bert
https://github.com/facebookresearch/Detectron
https://github.com/openai/gym
https://github.com/tiangolo/fastapi
https://github.com/ycm-core/YouCompleteMe
https://github.com/0xAX/linux-insides
https://github.com/satwikkansal/wtfpython
https://github.com/pypa/pipenv
https://github.com/CorentinJ/Real-Time-Voice-Cloning
https://github.com/donnemartin/interactive-coding-challenges
https://github.com/docker/compose
https://github.com/iperov/DeepFaceLab
https://github.com/mitmproxy/mitmproxy
https://github.com/donnemartin/data-science-ipython-notebooks
https://github.com/tornadoweb/tornado
https://github.com/chubin/cheat.sh
https://github.com/trailofbits/algo
https://github.com/geekcomputers/Python
https://github.com/encode/django-rest-framework
https://github.com/d2l-ai/d2l-zh
https://github.com/apache/airflow
https://github.com/matterport/Mask_RCNN
https://github.com/swisskyrepo/PayloadsAllTheThings
https://github.com/yunjey/pytorch-tutorial
https://github.com/sqlmapproject/sqlmap
https://github.com/psf/black
https://github.com/eriklindernoren/ML-From-Scratch
https://github.com/keon/algorithms
https://github.com/google/python-fire
https://github.com/explosion/spaCy
https://github.com/drduh/macOS-Security-and-Privacy-Guide
https://github.com/nicolargo/glances
https://github.com/sebastianruder/NLP-progress
https://github.com/StevenBlack/hosts
https://github.com/tqdm/tqdm
https://github.com/celery/celery
https://github.com/magenta/magenta
https://github.com/gto76/python-cheatsheet
https://github.com/reddit-archive/reddit
https://github.com/numpy/numpy
https://github.com/sherlock-project/sherlock
https://github.com/instillai/TensorFlow-Course
https://github.com/charlax/professional-programming
https://github.com/binux/pyspider
https://github.com/ipython/ipython
https://github.com/deezer/spleeter
https://github.com/pytorch/examples
https://github.com/toml-lang/toml
https://github.com/luong-komorebi/Awesome-Linux-Software
https://github.com/bokeh/bokeh
https://github.com/bitcoinbook/bitcoinbook
https://github.com/locustio/locust
https://github.com/nginx-proxy/nginx-proxy
https://github.com/microsoft/cascadia-code
https://github.com/OWASP/CheatSheetSeries
https://github.com/spotify/luigi
https://github.com/cool-RR/PySnooper
https://github.com/ray-project/ray
https://github.com/openai/gpt-2
https://github.com/willmcgugan/rich
https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
https://github.com/facebookresearch/detectron2
https://github.com/plotly/dash
https://github.com/PaddlePaddle/Paddle
https://github.com/cookiecutter/cookiecutter
https://github.com/chubin/wttr.in
https://github.com/zulip/zulip
https://github.com/python-poetry/poetry
https://github.com/fabric/fabric
https://github.com/matplotlib/matplotlib
https://github.com/tzutalin/labelImg
\ No newline at end of file