Showing
1 changed file
with
116 additions
and
0 deletions
gitparser.py
0 → 100644
1 | +# Copyright 2020-present Tae Hwan Jung | ||
2 | +# | ||
3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | +# you may not use this file except in compliance with the License. | ||
5 | +# You may obtain a copy of the License at | ||
6 | +# | ||
7 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | +# | ||
9 | +# Unless required by applicable law or agreed to in writing, software | ||
10 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | +# See the License for the specific language governing permissions and | ||
13 | +# limitations under the License. | ||
14 | + | ||
15 | +import os | ||
16 | +import json | ||
17 | +import jsonlines | ||
18 | +import argparse | ||
19 | +from tqdm import tqdm | ||
20 | +from functools import partial | ||
21 | +from collections import defaultdict | ||
22 | +from multiprocessing.pool import Pool | ||
23 | +from transformers import RobertaTokenizer | ||
24 | +from pydriller import GitRepository, RepositoryMining | ||
25 | + | ||
26 | + | ||
def jobs(repo_paths, args):
    """Mine the commit history of selected files in one repository.

    For each requested path, every commit reported by
    ``GitRepository.get_commits_modified_file`` is traversed. When the commit
    contains a modification whose ``new_path`` equals the requested path, one
    JSON-lines record is appended to ``args.output_file`` with the keys
    ``repo``, ``path``, ``sha``, ``msg``, ``added`` and ``deleted``.

    Args:
        repo_paths: ``(repo, paths)`` pair. ``repo`` is the repository
            directory name relative to ``args.repos_dir``; ``paths`` is an
            iterable of file paths to look up in that repository.
        args: Namespace providing ``repos_dir``, ``output_file`` and a
            ``tokenizer`` object with a ``tokenize(str)`` method.

    Returns:
        None. A repository that does not exist under ``args.repos_dir`` is
        skipped without writing anything.
    """
    repo, paths = repo_paths
    repo_path = os.path.join(args.repos_dir, repo)

    # Nothing to mine when the repository is not available locally.
    if not os.path.exists(repo_path):
        return

    tokenize = args.tokenizer.tokenize
    gr = GitRepository(repo_path)

    for path in paths:
        commits = gr.get_commits_modified_file(path)
        # No commit touched this path, so there is nothing to traverse.
        if not commits:
            continue

        mining = RepositoryMining(repo_path, only_commits=commits)
        for commit in mining.traverse_commits():
            # Only the first line (the subject) of the commit message is kept.
            message = commit.msg.split("\n")[0]

            added, deleted = [], []
            matched = False
            for mod in commit.modifications:
                if mod.new_path != path:
                    continue
                matched = True
                # Read the parsed diff once and reuse it for both sides.
                parsed = mod.diff_parsed
                for _, code in parsed["added"]:
                    added.extend(tokenize(code))
                for _, code in parsed["deleted"]:
                    deleted.extend(tokenize(code))

            # Without a matching modification there is no diff to record.
            if not matched:
                continue

            # The file is reopened in append mode for every record, so each
            # record is flushed to disk as soon as it is complete.
            with jsonlines.open(args.output_file, mode="a") as writer:
                writer.write(
                    {
                        "repo": repo,
                        "path": path,
                        "sha": commit.hash,
                        "msg": tokenize(message),
                        "added": added,
                        "deleted": deleted,
                    }
                )
63 | + | ||
def main(args):
    """Group the requested files by repository and mine them in parallel.

    Reads ``args.jsonl_file`` (one JSON object per line, each with ``repo``
    and ``path`` keys), collects the paths of every repository, and dispatches
    one ``jobs`` call per repository to a pool of ``args.num_workers``
    processes while showing a progress bar.

    Args:
        args: Namespace providing ``jsonl_file``, ``output_dir`` and
            ``num_workers``, plus whatever ``jobs`` reads from it. The
            attribute ``output_file`` is set on it by this function.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``repo`` or ``path`` key.
    """
    repos = defaultdict(list)
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside json.loads.
            if not line:
                continue
            js = json.loads(line)
            repos[js["repo"]].append(js["path"])

    os.makedirs(args.output_dir, exist_ok=True)
    # The output keeps the input's file name, placed under the output dir.
    args.output_file = os.path.join(
        args.output_dir, os.path.basename(args.jsonl_file)
    )

    func = partial(jobs, args=args)
    with Pool(processes=args.num_workers) as pool:
        # A single progress bar that advances once per finished repository.
        with tqdm(total=len(repos)) as pbar:
            for _ in pool.imap_unordered(func, repos.items()):
                pbar.update()
80 | + | ||
81 | + | ||
if __name__ == "__main__":
    # Command-line entry point: declare the options, load the tokenizer named
    # on the command line, then hand everything to main().
    parser = argparse.ArgumentParser(description="")

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--jsonl_file", dict(type=str, required=True, help="jsonl file path.")),
        (
            "--repos_dir",
            dict(
                type=str,
                required=True,
                help="directory that all repositories will be downloaded.",
            ),
        ),
        (
            "--output_dir",
            dict(
                type=str,
                required=True,
                help="The output directory where the preprocessed data will be written.",
            ),
        ),
        (
            "--tokenizer_name",
            dict(
                type=str,
                default="microsoft/codebert-base",
                help="The name of tokenizer",
            ),
        ),
        ("--num_workers", dict(default=4, type=int, help="number of process")),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()

    # The tokenizer travels on the namespace alongside the parsed options.
    args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)

    main(args)
-
Please register or login to post a comment