(add) git parser

graykode
Commit 28ef12380c0f5ae960c04aff0301bcd3cdb26850 28ef1238 1 parent f9e5ae18
Showing 1 changed file with 116 additions and 0 deletions
gitparser.py
--- a/gitparser.py 0 → 100644
View file @28ef123
+++ b/gitparser.py 0 → 100644
View file @28ef123
+ # Copyright 2020-present Tae Hwan Jung
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ 
+ import os
+ import json
+ import jsonlines
+ import argparse
+ from tqdm import tqdm
+ from functools import partial
+ from collections import defaultdict
+ from multiprocessing.pool import Pool
+ from transformers import RobertaTokenizer
+ from pydriller import GitRepository, RepositoryMining
+ 
+ 
+ def jobs(repo_paths, args):
+     repo, paths = repo_paths
+     repo_path = os.path.join(args.repos_dir, repo)
+ 
+     if os.path.exists(repo_path):
+         gr = GitRepository(repo_path)
+ 
+         for path in paths:
+             commits = gr.get_commits_modified_file(path)
+             for commit in RepositoryMining(
+                 repo_path, only_commits=commits
+             ).traverse_commits():
+                 message = (commit.msg).split("\n")[0]
+ 
+                 added, deleted = [], []
+                 for mod in commit.modifications:
+                     if mod.new_path == path:
+                         for line, code in mod.diff_parsed["added"]:
+                             added += args.tokenizer.tokenize(code)
+                             assert isinstance(added, list)
+ 
+                         for line, code in mod.diff_parsed["deleted"]:
+                             deleted += args.tokenizer.tokenize(code)
+                             assert isinstance(deleted, list)
+ 
+                         with jsonlines.open(args.output_file, mode="a") as writer:
+                             writer.write(
+                                 {
+                                     "repo": repo,
+                                     "path": path,
+                                     "sha": commit.hash,
+                                     "msg": args.tokenizer.tokenize(message),
+                                     "added": added,
+                                     "deleted": deleted,
+                                 }
+                             )
+ 
+ def main(args):
+     repos = defaultdict(list)
+     with open(args.jsonl_file, encoding="utf-8") as f:
+         for idx, line in enumerate(f):
+             line = line.strip()
+             js = json.loads(line)
+             repos[js["repo"]].append(js["path"])
+ 
+     os.makedirs(args.output_dir, exist_ok=True)
+     args.output_file = os.path.join(args.output_dir, os.path.basename(args.jsonl_file))
+ 
+     func = partial(jobs, args=args)
+     with Pool(processes=args.num_workers) as pool:
+         with tqdm(total=len(repos)) as pbar:
+             for i, _ in tqdm(enumerate(pool.imap_unordered(func, repos.items()))):
+                 pbar.update()
+ 
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="")
+     parser.add_argument(
+         "--jsonl_file", type=str, required=True, help="jsonl file path."
+     )
+     parser.add_argument(
+         "--repos_dir",
+         type=str,
+         required=True,
+         help="directory that all repositories will be downloaded.",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         required=True,
+         help="The output directory where the preprocessed data will be written.",
+     )
+     parser.add_argument(
+         "--tokenizer_name",
+         type=str,
+         default="microsoft/codebert-base",
+         help="The name of tokenizer",
+     )
+     parser.add_argument(
+         "--num_workers",
+         default=4,
+         type=int,
+         help="number of process",
+     )
+ 
+     args = parser.parse_args()
+ 
+     args.tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
+ 
+     main(args)