Toggle navigation
Toggle navigation
This project
Loading...
Sign in
graykode
/
commit-autosuggestions
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
graykode
2020-10-30 21:56:39 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
3d9624b737905098346b69ab810041a1c292cb3a
3d9624b7
1 parent
4c89b4b6
(add) splitting code
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
2 deletions
gitparser.py
gitparser.py
View file @
3d9624b
...
...
@@ -15,14 +15,14 @@
import
os
import
re
import
json
import
random
import
jsonlines
import
argparse
from
tqdm
import
tqdm
from
functools
import
partial
from
collections
import
defaultdict
from
multiprocessing.pool
import
Pool
from
transformers
import
RobertaTokenizer
from
pydriller
import
GitRepository
,
RepositoryMining
from
pydriller
import
RepositoryMining
def
message_cleaner
(
message
):
msg
=
message
.
split
(
"
\n
"
)[
0
]
...
...
@@ -69,6 +69,12 @@ def jobs(repo, args):
}
)
def write_jsonl(lines, path, mode):
    """Append records to ``<path>/<mode>.jsonl``, one JSON document per line.

    Args:
        lines: Iterable of JSON-serializable records to write.
        path: Directory in which the output file lives.
        mode: Name of the split (e.g. ``'train'``); used as the file stem.

    Note:
        The file is opened in append mode, so an existing file is extended
        rather than truncated. Because the file is opened before iterating,
        it is created even when ``lines`` is empty.
    """
    saved_path = os.path.join(path, mode)
    # Open the output once for the whole batch; re-opening the file for
    # every single record is needless per-line open/close overhead.
    with jsonlines.open(f"{saved_path}.jsonl", mode="a") as writer:
        for line in lines:
            writer.write(line)
def
main
(
args
):
repos
=
set
()
with
open
(
args
.
repositories
,
encoding
=
"utf-8"
)
as
f
:
...
...
@@ -85,6 +91,27 @@ def main(args):
for
i
,
_
in
tqdm
(
enumerate
(
pool
.
imap_unordered
(
func
,
repos
))):
pbar
.
update
()
data
=
[]
with
open
(
args
.
output_file
,
encoding
=
"utf-8"
)
as
f
:
for
idx
,
line
in
enumerate
(
f
):
line
=
line
.
strip
()
data
.
append
(
json
.
loads
(
line
))
random
.
shuffle
(
data
)
n_data
=
len
(
data
)
write_jsonl
(
data
[:
int
(
n_data
*
0.9
)],
path
=
args
.
output_dir
,
mode
=
'train'
)
write_jsonl
(
data
[
int
(
n_data
*
0.9
):
int
(
n_data
*
0.95
)],
path
=
args
.
output_dir
,
mode
=
'validation'
)
write_jsonl
(
data
[
int
(
n_data
*
0.95
):],
path
=
args
.
output_dir
,
mode
=
'test'
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
""
)
...
...
Please
register
or
log in
to post a comment