Showing 1 changed file with 116 additions and 0 deletions
gitparser.py
0 → 100644
| 1 | +# Copyright 2020-present Tae Hwan Jung | ||
| 2 | +# | ||
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 4 | +# you may not use this file except in compliance with the License. | ||
| 5 | +# You may obtain a copy of the License at | ||
| 6 | +# | ||
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
| 8 | +# | ||
| 9 | +# Unless required by applicable law or agreed to in writing, software | ||
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 12 | +# See the License for the specific language governing permissions and | ||
| 13 | +# limitations under the License. | ||
| 14 | + | ||
| 15 | +import os | ||
| 16 | +import json | ||
| 17 | +import jsonlines | ||
| 18 | +import argparse | ||
| 19 | +from tqdm import tqdm | ||
| 20 | +from functools import partial | ||
| 21 | +from collections import defaultdict | ||
| 22 | +from multiprocessing.pool import Pool | ||
| 23 | +from transformers import RobertaTokenizer | ||
| 24 | +from pydriller import GitRepository, RepositoryMining | ||
| 25 | + | ||
| 26 | + | ||
def jobs(repo_paths, args):
    """Tokenize per-commit diffs of selected files in one repository.

    For each requested path, the commits reported by
    ``GitRepository.get_commits_modified_file`` are traversed, and one JSON
    line per commit is appended to ``args.output_file`` with the keys
    ``repo``, ``path``, ``sha``, ``msg``, ``added`` and ``deleted``.

    Args:
        repo_paths: ``(repo, paths)`` pair. ``repo`` is joined onto
            ``args.repos_dir`` to locate the repository; ``paths`` is an
            iterable of file paths to inspect inside it.
        args: Namespace providing ``repos_dir``, ``output_file`` and a
            ``tokenizer`` object with a ``tokenize(text)`` method.

    Returns:
        None. The function returns immediately, writing nothing, when the
        repository directory does not exist.
    """
    repo, paths = repo_paths
    repo_path = os.path.join(args.repos_dir, repo)

    # Guard clause: nothing to mine if the repository is not on disk.
    if not os.path.exists(repo_path):
        return

    tokenize = args.tokenizer.tokenize
    gr = GitRepository(repo_path)

    for path in paths:
        commits = gr.get_commits_modified_file(path)
        mining = RepositoryMining(repo_path, only_commits=commits)

        for commit in mining.traverse_commits():
            # Only the first line of the commit message is kept.
            message = commit.msg.split("\n")[0]

            added, deleted = [], []
            for mod in commit.modifications:
                # A commit may touch many files; keep only the requested one.
                if mod.new_path != path:
                    continue
                # diff_parsed entries are (line_number, text) pairs; only the
                # text is tokenized.
                for _, code in mod.diff_parsed["added"]:
                    added += tokenize(code)
                for _, code in mod.diff_parsed["deleted"]:
                    deleted += tokenize(code)

            # NOTE(review): the file is reopened in append mode for every
            # record, so each record is flushed on close. main() runs this
            # function in several pool workers that share the same output
            # file -- confirm that concurrent appends cannot interleave.
            with jsonlines.open(args.output_file, mode="a") as writer:
                writer.write(
                    {
                        "repo": repo,
                        "path": path,
                        "sha": commit.hash,
                        "msg": tokenize(message),
                        "added": added,
                        "deleted": deleted,
                    }
                )
| 63 | + | ||
def main(args):
    """Group input paths by repository and process repositories in parallel.

    Reads ``args.jsonl_file`` (one JSON object per line, each with ``repo``
    and ``path`` keys), collects the paths of every repository, creates
    ``args.output_dir`` and sets ``args.output_file`` to a file in it named
    after the input file. Each ``(repo, paths)`` pair is then passed to
    ``jobs`` through a pool of ``args.num_workers`` processes.

    Args:
        args: Namespace providing ``jsonl_file``, ``output_dir`` and
            ``num_workers``, plus whatever ``jobs`` reads from it. The
            attribute ``output_file`` is added by this function.
    """
    repos = defaultdict(list)
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines instead of crashing in json.loads.
                continue
            js = json.loads(line)
            repos[js["repo"]].append(js["path"])

    os.makedirs(args.output_dir, exist_ok=True)
    args.output_file = os.path.join(
        args.output_dir, os.path.basename(args.jsonl_file)
    )

    func = partial(jobs, args=args)
    with Pool(processes=args.num_workers) as pool:
        # One progress bar with a known total; ticks once per finished
        # repository, in completion order.
        with tqdm(total=len(repos)) as pbar:
            for _ in pool.imap_unordered(func, repos.items()):
                pbar.update()
| 80 | + | ||
| 81 | + | ||
if __name__ == "__main__":
    # Script entry point: parse the command line, load the tokenizer and
    # hand the populated namespace to main().
    parser = argparse.ArgumentParser(description="")

    # (flag, add_argument keyword arguments), registered in this order.
    cli_options = [
        (
            "--jsonl_file",
            dict(type=str, required=True, help="jsonl file path."),
        ),
        (
            "--repos_dir",
            dict(
                type=str,
                required=True,
                help="directory that all repositories will be downloaded.",
            ),
        ),
        (
            "--output_dir",
            dict(
                type=str,
                required=True,
                help="The output directory where the preprocessed data will be written.",
            ),
        ),
        (
            "--tokenizer_name",
            dict(
                type=str,
                default="microsoft/codebert-base",
                help="The name of tokenizer",
            ),
        ),
        (
            "--num_workers",
            dict(default=4, type=int, help="number of process"),
        ),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # The tokenizer is attached to the namespace so the worker function can
    # reach it through the same ``args`` object.
    args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)

    main(args)
-
Please register or login to post a comment