Showing
3 changed files
with
37 additions
and
18 deletions
| ... | @@ -46,17 +46,15 @@ Recommended Commit Message : Remove unused imports | ... | @@ -46,17 +46,15 @@ Recommended Commit Message : Remove unused imports |
| 46 | To solve this problem, use a new embedding called [`patch_type_embeddings`](https://github.com/graykode/commit-autosuggestions/blob/master/commit/model/diff_roberta.py#L40) that can distinguish added and deleted, just as Lample et al., 2019 (XLM) used language embeddings. (1 for added, 2 for deleted.) | 46 | To solve this problem, use a new embedding called [`patch_type_embeddings`](https://github.com/graykode/commit-autosuggestions/blob/master/commit/model/diff_roberta.py#L40) that can distinguish added and deleted, just as Lample et al., 2019 (XLM) used language embeddings. (1 for added, 2 for deleted.) |
| 47 | 47 | ||
| 48 | ### Language support | 48 | ### Language support |
| 49 | -| Language | Added | Diff | | 49 | +| Language | Added | Diff | Data(Diff) | Weights | |
| 50 | -| :------------- | :---: | :---:| | 50 | +| :------------- | :---: | :---:| :---: | :---:| |
| 51 | -| Python | ✅ | ✅ | | 51 | +| Python | ✅ | ✅ | [link](https://drive.google.com/drive/folders/1_8lQmzTH95Nc-4MKd1RP3x4BVc8tBA6W?usp=sharing) | [link](https://drive.google.com/drive/folders/1OwM7_FiLiwVJAhAanBPWtPw3Hz3Dszbh?usp=sharing) | |
| 52 | -| JavaScript | ⬜ | ⬜ | | 52 | +| JavaScript | ⬜ | ⬜ | ⬜ | ⬜ | |
| 53 | -| Go | ⬜ | ⬜ | | 53 | +| Go | ⬜ | ⬜ | ⬜ | ⬜ | |
| 54 | -| JAVA | ⬜ | ⬜ | | 54 | +| JAVA | ⬜ | ⬜ | ⬜ | ⬜ | |
| 55 | -| Ruby | ⬜ | ⬜ | | 55 | +| Ruby | ⬜ | ⬜ | ⬜ | ⬜ | |
| 56 | -| PHP | ⬜ | ⬜ | | 56 | +| PHP | ⬜ | ⬜ | ⬜ | ⬜ | |
| 57 | * ✅ — Supported | 57 | * ✅ — Supported |
| 58 | -* 🔶 — Partial support | ||
| 59 | -* 🚧 — Under development | ||
| 60 | * ⬜ - N/A ️ | 58 | * ⬜ - N/A ️ |
| 61 | 59 | ||
| 62 | We plan to gradually add the languages that are not yet supported. However, training models for the above languages requires expensive GPU instances on AWS or GCP. Please consider sponsoring this project! | 60 | We plan to gradually add the languages that are not yet supported. However, training models for the above languages requires expensive GPU instances on AWS or GCP. Please consider sponsoring this project! |
| ... | @@ -68,9 +66,18 @@ To run this project, you need a flask-based inference server (GPU) and a client | ... | @@ -68,9 +66,18 @@ To run this project, you need a flask-based inference server (GPU) and a client |
| 68 | Prepare Docker and Nvidia-docker before running the server. | 66 | Prepare Docker and Nvidia-docker before running the server. |
| 69 | 67 | ||
| 70 | ##### 1-a. If you have a GPU machine. | 68 | ##### 1-a. If you have a GPU machine. |
| 71 | -Serve flask server with Nvidia Docker | 69 | +Serve the Flask server with Nvidia Docker. Check the Docker tag for each programming language [here](https://hub.docker.com/repository/registry-1.docker.io/graykode/commit-autosuggestions/tags). |
| 70 | +| Language | Tag | | ||
| 71 | +| :------------- | :---: | | ||
| 72 | +| Python | py | | ||
| 73 | +| JavaScript | js | | ||
| 74 | +| Go | go | | ||
| 75 | +| JAVA | java | | ||
| 76 | +| Ruby | ruby | | ||
| 77 | +| PHP | php | | ||
| 78 | + | ||
| 72 | ```shell script | 79 | ```shell script |
| 73 | -$ docker run -it --gpus 0 -p 5000:5000 commit-autosuggestions:0.1-gpu | 80 | +$ docker run -it -d --gpus 0 -p 5000:5000 graykode/commit-autosuggestions:{language} |
| 74 | ``` | 81 | ``` |
| 75 | 82 | ||
| 76 | ##### 1-b. If you don't have a GPU machine. | 83 | ##### 1-b. If you don't have a GPU machine. | ... | ... |
| ... | @@ -10,14 +10,14 @@ ARG ADDED_MODEL="1YrkwfM-0VBCJaa9NYaXUQPODdGPsmQY4" | ... | @@ -10,14 +10,14 @@ ARG ADDED_MODEL="1YrkwfM-0VBCJaa9NYaXUQPODdGPsmQY4" |
| 10 | ARG DIFF_MODEL="1--gcVVix92_Fp75A-mWH0pJS0ahlni5m" | 10 | ARG DIFF_MODEL="1--gcVVix92_Fp75A-mWH0pJS0ahlni5m" |
| 11 | 11 | ||
| 12 | RUN git clone https://github.com/graykode/commit-autosuggestions.git /app/commit-autosuggestions \ | 12 | RUN git clone https://github.com/graykode/commit-autosuggestions.git /app/commit-autosuggestions \ |
| 13 | - && cd /app/commit-autosuggestions && python3 setup.py install | 13 | + && cd /app/commit-autosuggestions |
| 14 | 14 | ||
| 15 | WORKDIR /app/commit-autosuggestions | 15 | WORKDIR /app/commit-autosuggestions |
| 16 | 16 | ||
| 17 | RUN pip3 install ${PYTORCH_WHEEL} gdown | 17 | RUN pip3 install ${PYTORCH_WHEEL} gdown |
| 18 | -RUN gdown https://drive.google.com/uc?id=${ADDED_MODEL} -O weight/added/ | 18 | +RUN gdown https://drive.google.com/uc?id=${ADDED_MODEL} -O weight/python/added/ |
| 19 | -RUN gdown https://drive.google.com/uc?id=${DIFF_MODEL} -O weight/diff/ | 19 | +RUN gdown https://drive.google.com/uc?id=${DIFF_MODEL} -O weight/python/diff/ |
| 20 | 20 | ||
| 21 | RUN pip3 install -r requirements.txt | 21 | RUN pip3 install -r requirements.txt |
| 22 | 22 | ||
| 23 | -ENTRYPOINT ["python3", "app.py"] | 23 | +ENTRYPOINT ["python3", "app.py", "--load_model_path", "./weights/python/"] | ... | ... |
| ... | @@ -24,6 +24,15 @@ from multiprocessing.pool import Pool | ... | @@ -24,6 +24,15 @@ from multiprocessing.pool import Pool |
| 24 | from transformers import RobertaTokenizer | 24 | from transformers import RobertaTokenizer |
| 25 | from pydriller import RepositoryMining | 25 | from pydriller import RepositoryMining |
| 26 | 26 | ||
| 27 | +language = { | ||
| 28 | + 'py' : ['.py'], | ||
| 29 | + 'js' : ['.js', '.ts'], | ||
| 30 | + 'go' : ['.go'], | ||
| 31 | + 'java' : ['.java'], | ||
| 32 | + 'ruby' : ['.rb'], | ||
| 33 | + 'php' : ['.php'] | ||
| 34 | +} | ||
| 35 | + | ||
| 27 | def message_cleaner(message): | 36 | def message_cleaner(message): |
| 28 | msg = message.split("\n")[0] | 37 | msg = message.split("\n")[0] |
| 29 | msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) | 38 | msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) |
| ... | @@ -34,7 +43,7 @@ def jobs(repo, args): | ... | @@ -34,7 +43,7 @@ def jobs(repo, args): |
| 34 | repo_path = os.path.join(args.repos_dir, repo) | 43 | repo_path = os.path.join(args.repos_dir, repo) |
| 35 | if os.path.exists(repo_path): | 44 | if os.path.exists(repo_path): |
| 36 | for commit in RepositoryMining( | 45 | for commit in RepositoryMining( |
| 37 | - repo_path, only_modifications_with_file_types=['.py'] | 46 | + repo_path, only_modifications_with_file_types=language[args.lang] |
| 38 | ).traverse_commits(): | 47 | ).traverse_commits(): |
| 39 | cleaned_message = message_cleaner(commit.msg) | 48 | cleaned_message = message_cleaner(commit.msg) |
| 40 | tokenized_message = args.tokenizer.tokenize(cleaned_message) | 49 | tokenized_message = args.tokenizer.tokenize(cleaned_message) |
| ... | @@ -44,7 +53,7 @@ def jobs(repo, args): | ... | @@ -44,7 +53,7 @@ def jobs(repo, args): |
| 44 | for mod in commit.modifications: | 53 | for mod in commit.modifications: |
| 45 | if not (mod.old_path and mod.new_path): | 54 | if not (mod.old_path and mod.new_path): |
| 46 | continue | 55 | continue |
| 47 | - if os.path.splitext(mod.new_path)[1] != '.py': | 56 | + if os.path.splitext(mod.new_path)[1] not in language[args.lang]: |
| 48 | continue | 57 | continue |
| 49 | if not mod.diff_parsed["added"]: | 58 | if not mod.diff_parsed["added"]: |
| 50 | continue | 59 | continue |
| ... | @@ -121,6 +130,9 @@ if __name__ == "__main__": | ... | @@ -121,6 +130,9 @@ if __name__ == "__main__": |
| 121 | help="directory that all repositories had been downloaded.",) | 130 | help="directory that all repositories had been downloaded.",) |
| 122 | parser.add_argument("--output_dir", type=str, required=True, | 131 | parser.add_argument("--output_dir", type=str, required=True, |
| 123 | help="The output directory where the preprocessed data will be written.") | 132 | help="The output directory where the preprocessed data will be written.") |
| 133 | + parser.add_argument("--lang", type=str, required=True, | ||
| 134 | + choices=['py', 'js', 'go', 'java', 'ruby', 'php'], | ||
| 135 | + help="The output directory where the preprocessed data will be written.") | ||
| 124 | parser.add_argument("--tokenizer_name", type=str, | 136 | parser.add_argument("--tokenizer_name", type=str, |
| 125 | default="microsoft/codebert-base", help="The name of tokenizer",) | 137 | default="microsoft/codebert-base", help="The name of tokenizer",) |
| 126 | parser.add_argument("--num_workers", default=4, type=int, help="number of process") | 138 | parser.add_argument("--num_workers", default=4, type=int, help="number of process") | ... | ... |
-
Please register or login to post a comment