graykode

(add) git parser

# Copyright 2020-present Tae Hwan Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import json
import argparse
from functools import partial
from collections import defaultdict
from multiprocessing.pool import Pool

import jsonlines
from tqdm import tqdm
from transformers import RobertaTokenizer

# pydriller 1.x API: GitRepository and RepositoryMining were renamed
# (to Git and Repository) in pydriller 2.0.
from pydriller import GitRepository, RepositoryMining


def jobs(repo_paths, args):
    """Mine every commit that modified the given files of one repository and
    append one tokenized JSON record per (commit, path) pair."""
    repo, paths = repo_paths
    repo_path = os.path.join(args.repos_dir, repo)

    if not os.path.exists(repo_path):
        return

    gr = GitRepository(repo_path)
    for path in paths:
        # Every commit in which this file was modified.
        commits = gr.get_commits_modified_file(path)
        for commit in RepositoryMining(
            repo_path, only_commits=commits
        ).traverse_commits():
            # Keep only the first line (the subject) of the commit message.
            message = commit.msg.split("\n")[0]

            added, deleted = [], []
            for mod in commit.modifications:
                # Track the file by its path after the change.
                if mod.new_path == path:
                    for line, code in mod.diff_parsed["added"]:
                        added += args.tokenizer.tokenize(code)
                    for line, code in mod.diff_parsed["deleted"]:
                        deleted += args.tokenizer.tokenize(code)

            # Append one JSON line per record; workers share the output file,
            # so each record is written in a single short append.
            with jsonlines.open(args.output_file, mode="a") as writer:
                writer.write(
                    {
                        "repo": repo,
                        "path": path,
                        "sha": commit.hash,
                        "msg": args.tokenizer.tokenize(message),
                        "added": added,
                        "deleted": deleted,
                    }
                )


def main(args):
    # Group the requested file paths by repository.
    repos = defaultdict(list)
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            js = json.loads(line.strip())
            repos[js["repo"]].append(js["path"])

    os.makedirs(args.output_dir, exist_ok=True)
    args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))

    # One task per repository, fanned out over the worker pool; a single
    # progress bar ticks once per finished repository.
    func = partial(jobs, args=args)
    with Pool(processes=args.num_workers) as pool:
        with tqdm(total=len(repos)) as pbar:
            for _ in pool.imap_unordered(func, repos.items()):
                pbar.update()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Mine commits with pydriller and tokenize their diffs."
    )
    parser.add_argument(
        "--jsonl_file", type=str, required=True, help="jsonl file path."
    )
    parser.add_argument(
        "--repos_dir",
        type=str,
        required=True,
        help="directory where all repositories have been downloaded.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the preprocessed data will be written.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default="microsoft/codebert-base",
        help="name or path of the tokenizer.",
    )
    parser.add_argument(
        "--num_workers",
        default=4,
        type=int,
        help="number of worker processes.",
    )

    args = parser.parse_args()

    args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)

    main(args)
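
For reference, a minimal sketch of how this script might be invoked, assuming it is saved as git_parser.py (the file and directory names below are placeholders, not part of the commit). The input jsonl is expected to hold one {"repo": ..., "path": ...} object per line, which is what main() reads:

    # assumed input record, one per line in data/train.jsonl:
    #   {"repo": "owner/name", "path": "src/module.py"}
    python git_parser.py \
        --jsonl_file data/train.jsonl \
        --repos_dir repos \
        --output_dir output \
        --num_workers 4

Each repository must already be cloned at repos_dir/<repo>, since jobs() silently skips repositories whose local path does not exist.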