graykode

reinit

@@ -2,6 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.bin
 
 # C extensions
 *.so
......
1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
\ No newline at end of file
1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
14 +
15 +import os
16 +import torch
17 +import argparse
18 +import whatthepatch
19 +from tqdm import tqdm
20 +import torch.nn as nn
21 +from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
22 +from transformers import (RobertaConfig, RobertaTokenizer)
23 +
24 +from autocommit.model import Seq2Seq
25 +from autocommit.utils import (Example, convert_examples_to_features)
26 +from autocommit.model.diff_roberta import RobertaModel
27 +
28 +from flask import Flask, jsonify, request
29 +
30 +app = Flask(__name__)
31 +
32 +MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
33 +
34 +def get_model(model_class, config, tokenizer, mode):
35 + encoder = model_class(config=config)
36 + decoder_layer = nn.TransformerDecoderLayer(
37 + d_model=config.hidden_size, nhead=config.num_attention_heads
38 + )
39 + decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
40 + model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
41 + beam_size=args.beam_size, max_length=args.max_target_length,
42 + sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
43 +
44 + assert args.load_model_path
45 + assert os.path.exists(os.path.join(args.load_model_path, mode, 'pytorch_model.bin'))
46 +
47 + model.load_state_dict(
48 + torch.load(
49 + os.path.join(args.load_model_path, mode, 'pytorch_model.bin'),
50 + map_location=torch.device(args.device)
51 + ),
52 + strict=False
53 + )
54 + return model
55 +
56 +def get_features(examples):
57 + features = convert_examples_to_features(examples, args.tokenizer, args, stage='test')
58 + all_source_ids = torch.tensor([f.source_ids for f in features], dtype=torch.long)
59 + all_source_mask = torch.tensor([f.source_mask for f in features], dtype=torch.long)
60 + all_patch_ids = torch.tensor([f.patch_ids for f in features], dtype=torch.long)
61 + return TensorDataset(all_source_ids, all_source_mask, all_patch_ids)
62 +
63 +def create_app():
64 + @app.route('/')
65 + def index():
66 + return jsonify(hello="world")
67 +
68 + @app.route('/added', methods=['POST'])
69 + def added():
70 + if request.method == 'POST':
71 + payload = request.get_json()
72 + example = [
73 + Example(
74 + idx=payload['idx'],
75 + added=payload['added'],
76 + deleted=payload['deleted'],
77 + target=None
78 + )
79 + ]
80 + message = inference(model=args.added_model, data=get_features(example))
81 + return jsonify(idx=payload['idx'], message=message)
82 +
83 + @app.route('/diff', methods=['POST'])
84 + def diff():
85 + if request.method == 'POST':
86 + payload = request.get_json()
87 + example = [
88 + Example(
89 + idx=payload['idx'],
90 + added=payload['added'],
91 + deleted=payload['deleted'],
92 + target=None
93 + )
94 + ]
95 + message = inference(model=args.diff_model, data=get_features(example))
96 + return jsonify(idx=payload['idx'], message=message)
97 +
98 + @app.route('/tokenizer', methods=['POST'])
99 + def tokenizer():
100 + if request.method == 'POST':
101 + payload = request.get_json()
102 + tokens = args.tokenizer.tokenize(payload['line'])
103 + return jsonify(tokens=tokens)
104 +
105 + return app
106 +
107 +def inference(model, data):
108 +    # run beam-search inference over the request batch
109 + eval_sampler = SequentialSampler(data)
110 + eval_dataloader = DataLoader(data, sampler=eval_sampler, batch_size=len(data))
111 +
112 + model.eval()
113 + p=[]
114 + for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
115 + batch = tuple(t.to(args.device) for t in batch)
116 + source_ids, source_mask, patch_ids = batch
117 + with torch.no_grad():
118 + preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
119 + for pred in preds:
120 + t = pred[0].cpu().numpy()
121 + t = list(t)
122 + if 0 in t:
123 + t = t[:t.index(0)]
124 + text = args.tokenizer.decode(t, clean_up_tokenization_spaces=False)
125 + p.append(text)
126 + return p
127 +
128 +def main(args):
129 + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
130 + config = config_class.from_pretrained(args.config_name)
131 + args.tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, do_lower_case=args.do_lower_case)
132 +
133 +    # build models
134 +    args.added_model = get_model(model_class=model_class, config=config,
135 + tokenizer=args.tokenizer, mode='added').to(args.device)
136 + args.diff_model = get_model(model_class=model_class, config=config,
137 + tokenizer=args.tokenizer, mode='diff').to(args.device)
138 +
139 + app = create_app()
140 + app.run(host=args.host, debug=True, port=args.port)
141 +
142 +if __name__ == '__main__':
143 + parser = argparse.ArgumentParser(description="")
144 + parser.add_argument("--load_model_path", default='weight', type=str,
145 + help="Path to trained model: Should contain the .bin files")
146 +
147 + parser.add_argument("--model_type", default='roberta', type=str,
148 + help="Model type: e.g. roberta")
149 + parser.add_argument("--config_name", default="microsoft/codebert-base", type=str,
150 + help="Pretrained config name or path if not the same as model_name")
151 + parser.add_argument("--tokenizer_name", type=str,
152 + default="microsoft/codebert-base", help="The name of tokenizer", )
153 + parser.add_argument("--max_source_length", default=256, type=int,
154 + help="The maximum total source sequence length after tokenization. Sequences longer "
155 + "than this will be truncated, sequences shorter will be padded.")
156 + parser.add_argument("--max_target_length", default=128, type=int,
157 + help="The maximum total target sequence length after tokenization. Sequences longer "
158 + "than this will be truncated, sequences shorter will be padded.")
159 + parser.add_argument("--beam_size", default=10, type=int,
160 + help="beam size for beam search")
161 + parser.add_argument("--do_lower_case", action='store_true',
162 + help="Set this flag if you are using an uncased model.")
163 + parser.add_argument("--no_cuda", action='store_true',
164 + help="Avoid using CUDA when available")
165 +
166 + parser.add_argument("--host", type=str, default="0.0.0.0")
167 + parser.add_argument("--port", type=int, default=5000)
168 +
169 + args = parser.parse_args()
170 +
171 + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
172 +
173 + main(args)
\ No newline at end of file
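
For reference, a minimal client for the server defined above might look like the sketch below. It only assumes what app.py itself defines: the /tokenizer, /added and /diff routes, the payload keys read from request.get_json(), and the argparse defaults for host and port; the requests library, the URL, and the sample line are illustrative. Note that /added and /diff expect added and deleted to already be token lists (convert_examples_to_features concatenates them as-is), which is exactly what the /tokenizer route returns.

# hypothetical client for the Flask server above -- not part of this commit
import requests

SERVER = "http://127.0.0.1:5000"  # argparse defaults: host 0.0.0.0, port 5000

# tokenize one added line with the server-side RobertaTokenizer
tokens = requests.post(SERVER + "/tokenizer",
                       json={"line": "print('hello world')"}).json()["tokens"]

# ask the added-lines model for commit message candidates (no deleted lines here)
resp = requests.post(SERVER + "/added",
                     json={"idx": 0, "added": tokens, "deleted": []}).json()
print(resp["message"])
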
1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
14 +
15 +import subprocess
16 +import whatthepatch
17 +
18 +def preprocessing(diff):
19 + added_examples, diff_examples = [], []
20 + isadded, isdeleted = False, False
21 + for idx, example in enumerate(whatthepatch.parse_patch(diff)):
22 + added, deleted = [], []
23 + for change in example.changes:
24 + if change.old == None and change.new != None:
25 + added.extend(tokenizer.tokenize(change.line))
26 + isadded = True
27 + elif change.old != None and change.new == None:
28 + deleted.extend(tokenizer.tokenize(change.line))
29 + isdeleted = True
30 +
31 + if isadded and isdeleted:
32 + pass
33 + else:
34 + pass
35 +
36 +def main():
37 + proc = subprocess.Popen(["git", "diff", "--cached"], stdout=subprocess.PIPE)
38 + staged_files = proc.stdout.readlines()
39 + staged_files = [f.decode("utf-8") for f in staged_files]
40 + staged_files = [f.strip() for f in staged_files]
41 + diffs = "\n".join(staged_files)
42 +
43 +
44 +if __name__ == '__main__':
45 + main()
\ No newline at end of file
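
The hook script above is still a stub: tokenizer is never defined, the isadded/isdeleted flags are not reset between files, both branches end in pass, and the collected diff text is unused. A possible completion, assuming the Flask server from app.py above is running locally, might look like the following; the endpoint URLs and payload keys follow app.py, everything else is illustrative.

# illustrative completion of the stub above -- not part of this commit
import subprocess

import requests
import whatthepatch

SERVER = "http://127.0.0.1:5000"

def tokenize(line):
    # delegate tokenization to the server's /tokenizer route
    return requests.post(SERVER + "/tokenizer", json={"line": line}).json()["tokens"]

def suggest_messages(diff):
    messages = []
    for idx, patch in enumerate(whatthepatch.parse_patch(diff)):
        added, deleted = [], []
        for change in patch.changes:
            if change.old is None and change.new is not None:
                added.extend(tokenize(change.line))
            elif change.old is not None and change.new is None:
                deleted.extend(tokenize(change.line))
        # use the diff model when lines were both added and deleted,
        # otherwise fall back to the added-only model
        endpoint = "/diff" if (added and deleted) else "/added"
        payload = {"idx": idx, "added": added, "deleted": deleted}
        messages.extend(requests.post(SERVER + endpoint, json=payload).json()["message"])
    return messages

if __name__ == "__main__":
    staged_diff = subprocess.check_output(["git", "diff", "--cached"]).decode("utf-8")
    print("\n".join(suggest_messages(staged_diff)))
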
1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
14 +
15 +from autocommit.model.diff_roberta import RobertaModel
16 +from autocommit.model.model import Seq2Seq
17 +
18 +__all__ = [
19 + 'RobertaModel',
20 + 'Seq2Seq'
21 +]
\ No newline at end of file
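
With this __init__, both classes are importable directly from the package, which is how the updated train/run.py further down pulls them in:

# equivalent to the longer module paths used in app.py
from autocommit.model import Seq2Seq, RobertaModel
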
1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
14 +
15 +import logging
16 +
17 +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
18 + datefmt = '%m/%d/%Y %H:%M:%S',
19 + level = logging.INFO)
20 +logger = logging.getLogger(__name__)
21 +
22 +class Example(object):
23 + """A single training/test example."""
24 + def __init__(self,
25 + idx,
26 + added,
27 + deleted,
28 + target,
29 + ):
30 + self.idx = idx
31 + self.added = added
32 + self.deleted = deleted
33 + self.target = target
34 +
35 +class InputFeatures(object):
36 + """A single training/test features for a example."""
37 + def __init__(self,
38 + example_id,
39 + source_ids,
40 + target_ids,
41 + source_mask,
42 + target_mask,
43 + patch_ids,
44 +
45 + ):
46 + self.example_id = example_id
47 + self.source_ids = source_ids
48 + self.target_ids = target_ids
49 + self.source_mask = source_mask
50 + self.target_mask = target_mask
51 + self.patch_ids = patch_ids
52 +
53 +def convert_examples_to_features(examples, tokenizer, args, stage=None):
54 + features = []
55 + for example_index, example in enumerate(examples):
56 + # source
57 + added_tokens = [tokenizer.cls_token] + example.added + [tokenizer.sep_token]
58 + deleted_tokens = example.deleted + [tokenizer.sep_token]
59 + source_tokens = added_tokens + deleted_tokens
60 + patch_ids = [1] * len(added_tokens) + [2] * len(deleted_tokens)
61 + source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
62 + source_mask = [1] * (len(source_tokens))
63 + padding_length = args.max_source_length - len(source_ids)
64 + source_ids += [tokenizer.pad_token_id] * padding_length
65 + patch_ids += [0] * padding_length
66 + source_mask += [0] * padding_length
67 +
68 + # target
69 + if stage == "test":
70 + target_tokens = tokenizer.tokenize("None")
71 + else:
72 + target_tokens = (example.target)[:args.max_target_length - 2]
73 + target_tokens = [tokenizer.cls_token] + target_tokens + [tokenizer.sep_token]
74 + target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
75 + target_mask = [1] * len(target_ids)
76 + padding_length = args.max_target_length - len(target_ids)
77 + target_ids += [tokenizer.pad_token_id] * padding_length
78 + target_mask += [0] * padding_length
79 +
80 + if example_index < 5:
81 + if stage == 'train':
82 + logger.info("*** Example ***")
83 + logger.info("idx: {}".format(example.idx))
84 +
85 + logger.info("source_tokens: {}".format([x.replace('\u0120', '_') for x in source_tokens]))
86 + logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
87 + logger.info("patch_ids: {}".format(' '.join(map(str, patch_ids))))
88 + logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))
89 +
90 + logger.info("target_tokens: {}".format([x.replace('\u0120', '_') for x in target_tokens]))
91 + logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
92 + logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))
93 +
94 + features.append(
95 + InputFeatures(
96 + example_index,
97 + source_ids,
98 + target_ids,
99 + source_mask,
100 + target_mask,
101 + patch_ids,
102 + )
103 + )
104 +
105 + return features
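
To make the encoding above concrete, here is a schematic of what convert_examples_to_features builds for one example (token strings and lengths are illustrative, not output from a real tokenizer run): the added block, including <s> and its trailing </s>, is tagged with patch id 1, the deleted block with patch id 2, and all padding positions with 0.

# schematic of the feature layout built above (illustrative values)
# example.added  = ['print', '(x)']          example.deleted = ['print', 'x']
# source_tokens  = ['<s>', 'print', '(x)', '</s>', 'print', 'x', '</s>']
# patch_ids      = [  1,     1,      1,      1,      2,      2,    2  ] + [0] * padding
# source_mask    = [1] * 7                                             + [0] * padding
# source_ids     = tokenizer.convert_tokens_to_ids(source_tokens)      + [pad_token_id] * padding
# where padding = args.max_source_length - 7
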
 whatthepatch
 gitpython
-matorage
-transformers
 packaging
-
-psutil
-sacrebleu
-pyarrow>=0.16.0
-rouge-score
-pytorch-lightning==0.8.5
-pytest
\ No newline at end of file
......
1 -# Copyright 2020-present Tae Hwan Jung
2 -#
3 -# Licensed under the Apache License, Version 2.0 (the "License");
4 -# you may not use this file except in compliance with the License.
5 -# You may obtain a copy of the License at
6 -#
7 -# http://www.apache.org/licenses/LICENSE-2.0
8 -#
9 -# Unless required by applicable law or agreed to in writing, software
10 -# distributed under the License is distributed on an "AS IS" BASIS,
11 -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 -# See the License for the specific language governing permissions and
13 -# limitations under the License.
14 -
15 -import os
16 -import torch
17 -import logging
18 -from tqdm import tqdm
19 -import torch.nn as nn
20 -from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
21 -from transformers import (RobertaConfig, RobertaTokenizer)
22 -
23 -import argparse
24 -import whatthepatch
25 -from train.run import (Example, convert_examples_to_features)
26 -from train.model import Seq2Seq
27 -from train.customized_roberta import RobertaModel
28 -
29 -MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
30 -
31 -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
32 - datefmt = '%m/%d/%Y %H:%M:%S',
33 - level = logging.INFO)
34 -logger = logging.getLogger(__name__)
35 -
36 -def create_examples(diff, tokenizer):
37 - examples = []
38 - for idx, example in enumerate(whatthepatch.parse_patch(diff)):
39 - added, deleted = [], []
40 - for change in example.changes:
41 - if change.old == None and change.new != None:
42 - added.extend(tokenizer.tokenize(change.line))
43 - elif change.old != None and change.new == None:
44 - deleted.extend(tokenizer.tokenize(change.line))
45 - examples.append(
46 - Example(
47 - idx=idx,
48 - added=added,
49 - deleted=deleted,
50 - target=None
51 - )
52 - )
53 -
54 - return examples
55 -
56 -def main(args):
57 -
58 - config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
59 - config = config_class.from_pretrained(args.config_name)
60 - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, do_lower_case=args.do_lower_case)
61 -
62 - # budild model
63 - encoder = model_class(config=config)
64 - decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
65 - decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
66 - model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
67 - beam_size=args.beam_size, max_length=args.max_target_length,
68 - sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
69 - if args.load_model_path is not None:
70 - logger.info("reload model from {}".format(args.load_model_path))
71 - model.load_state_dict(torch.load(args.load_model_path), strict=False)
72 -
73 - model.to(args.device)
74 - with open("test.source", "r") as f:
75 - eval_examples = create_examples(f.read(), tokenizer)
76 -
77 - test_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
78 - all_source_ids = torch.tensor([f.source_ids for f in test_features], dtype=torch.long)
79 - all_source_mask = torch.tensor([f.source_mask for f in test_features], dtype=torch.long)
80 - all_patch_ids = torch.tensor([f.patch_ids for f in test_features], dtype=torch.long)
81 - test_data = TensorDataset(all_source_ids, all_source_mask, all_patch_ids)
82 -
83 - # Calculate bleu
84 - eval_sampler = SequentialSampler(test_data)
85 - eval_dataloader = DataLoader(test_data, sampler=eval_sampler, batch_size=len(test_data))
86 -
87 - model.eval()
88 - for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
89 - batch = tuple(t.to(args.device) for t in batch)
90 - source_ids, source_mask, patch_ids = batch
91 - with torch.no_grad():
92 - preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
93 - for pred in preds:
94 - t = pred[0].cpu().numpy()
95 - t = list(t)
96 - if 0 in t:
97 - t = t[:t.index(0)]
98 - text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
99 - print(text)
100 -
101 -
102 -if __name__ == '__main__':
103 - parser = argparse.ArgumentParser(description="")
104 - parser.add_argument("--load_model_path", default=None, type=str, required=True,
105 - help="Path to trained model: Should contain the .bin files")
106 -
107 - parser.add_argument("--model_type", default='roberta', type=str,
108 - help="Model type: e.g. roberta")
109 - parser.add_argument("--config_name", default="microsoft/codebert-base", type=str,
110 - help="Pretrained config name or path if not the same as model_name")
111 - parser.add_argument("--tokenizer_name", type=str,
112 - default="microsoft/codebert-base", help="The name of tokenizer", )
113 - parser.add_argument("--max_source_length", default=256, type=int,
114 - help="The maximum total source sequence length after tokenization. Sequences longer "
115 - "than this will be truncated, sequences shorter will be padded.")
116 - parser.add_argument("--max_target_length", default=128, type=int,
117 - help="The maximum total target sequence length after tokenization. Sequences longer "
118 - "than this will be truncated, sequences shorter will be padded.")
119 - parser.add_argument("--beam_size", default=10, type=int,
120 - help="beam size for beam search")
121 - parser.add_argument("--do_lower_case", action='store_true',
122 - help="Set this flag if you are using an uncased model.")
123 - parser.add_argument("--no_cuda", action='store_true',
124 - help="Avoid using CUDA when available")
125 -
126 - args = parser.parse_args()
127 -
128 - args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
129 -
130 - main(args)
\ No newline at end of file
1 -diff --git a/src/train/model.py b/src/train/model.py
2 -index 20e56b3..cab82e5 100644
3 ---- a/src/train/model.py
4 -+++ b/src/train/model.py
5 -@@ -3,9 +3,7 @@
6 -
7 - import torch
8 - import torch.nn as nn
9 --import torch
10 --from torch.autograd import Variable
11 --import copy
12 -+
13 - class Seq2Seq(nn.Module):
14 - """
15 - Build Seqence-to-Sequence.
16 -diff --git a/src/train/run.py b/src/train/run.py
17 -index 5961ad1..be98fec 100644
18 ---- a/src/train/run.py
19 -+++ b/src/train/run.py
20 -@@ -22,7 +22,6 @@ using a masked language modeling (MLM) loss.
21 - from __future__ import absolute_import
22 - import os
23 - import sys
24 --import bleu
25 - import pickle
26 - import torch
27 - import json
28 -@@ -35,11 +34,14 @@ from itertools import cycle
29 - import torch.nn as nn
30 - from model import Seq2Seq
31 - from tqdm import tqdm, trange
32 --from customized_roberta import RobertaModel
33 - from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
34 - from torch.utils.data.distributed import DistributedSampler
35 - from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
36 - RobertaConfig, RobertaTokenizer)
37 -+
38 -+import train.bleu as bleu
39 -+from train.customized_roberta import RobertaModel
40 -+
41 - MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
42 -
43 - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
1 +diff --git a/codebert/code.py b/codebert/code.py
2 +new file mode 100644
3 +index 0000000..b4bc953
4 +--- /dev/null
5 ++++ b/codebert/code.py
6 +@@ -0,0 +1,21 @@
7 ++def dailymotion_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
8 ++
9 ++ html = get_content(rebuilt_url(url))
10 ++ info = json.loads(match1(html, r'qualities":({.+?}),"'))
11 ++ title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \
12 ++ match1(html, r'"title"\s*:\s*"([^"]+)"')
13 ++ title = unicodize(title)
14 ++
15 ++ for quality in ['1080','720','480','380','240','144','auto']:
16 ++ try:
17 ++ real_url = info[quality][1]["url"]
18 ++ if real_url:
19 ++ break
20 ++ except KeyError:
21 ++ pass
22 ++
23 ++ mime, ext, size = url_info(real_url)
24 ++
25 ++ print_info(site_info, title, mime, size)
26 ++ if not info_only:
27 ++ download_urls([real_url], title, ext, size, output_dir=output_dir, merge=merge)
\ No newline at end of file
1 +diff --git a/src/train/model.py b/src/train/model.py
2 +index 20e56b3..cab82e5 100644
3 +--- a/src/train/model.py
4 ++++ b/src/train/model.py
5 +@@ -3,9 +3,7 @@
6 +
7 + import torch
8 + import torch.nn as nn
9 +-import torch
10 +-from torch.autograd import Variable
11 +-import copy
12 ++
13 + class Seq2Seq(nn.Module):
14 + """
15 + Build Seqence-to-Sequence.
\ No newline at end of file
@@ -21,8 +21,6 @@ using a masked language modeling (MLM) loss.
 
 from __future__ import absolute_import
 import os
-import sys
-import pickle
 import torch
 import json
 import random
@@ -30,17 +28,17 @@ import logging
 import argparse
 import numpy as np
 from io import open
-from itertools import cycle
+from tqdm import tqdm
 import torch.nn as nn
-from model import Seq2Seq
-from tqdm import tqdm, trange
-from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
+from itertools import cycle
+
+from torch.utils.data import (DataLoader, SequentialSampler, RandomSampler, TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-                          RobertaConfig, RobertaTokenizer)
+from transformers import (AdamW, get_linear_schedule_with_warmup, RobertaConfig, RobertaTokenizer)
 
-import train.bleu as bleu
-from train.customized_roberta import RobertaModel
+import bleu
+from autocommit.model import Seq2Seq, RobertaModel
+from autocommit.utils import (convert_examples_to_features, Example)
 
 MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
 
@@ -49,19 +47,6 @@ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(messa
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
 
-class Example(object):
-    """A single training/test example."""
-    def __init__(self,
-                 idx,
-                 added,
-                 deleted,
-                 target,
-                 ):
-        self.idx = idx
-        self.added = added
-        self.deleted = deleted
-        self.target = target
-
 def read_examples(filename):
     """Read examples from filename."""
     examples=[]
@@ -82,85 +67,6 @@ def read_examples(filename):
     return examples
 
 
-class InputFeatures(object):
-    """A single training/test features for a example."""
-    def __init__(self,
-                 example_id,
-                 source_ids,
-                 target_ids,
-                 source_mask,
-                 target_mask,
-                 patch_ids,
-
-                 ):
-        self.example_id = example_id
-        self.source_ids = source_ids
-        self.target_ids = target_ids
-        self.source_mask = source_mask
-        self.target_mask = target_mask
-        self.patch_ids = patch_ids
-
-
-
-def convert_examples_to_features(examples, tokenizer, args,stage=None):
-    features = []
-    for example_index, example in enumerate(examples):
-        #source
-        added_tokens=[tokenizer.cls_token]+example.added+[tokenizer.sep_token]
-        deleted_tokens=example.deleted+[tokenizer.sep_token]
-        source_tokens = added_tokens + deleted_tokens
-        patch_ids = [1] * len(added_tokens) + [2] * len(deleted_tokens)
-        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
-        source_mask = [1] * (len(source_tokens))
-        padding_length = args.max_source_length - len(source_ids)
-        source_ids+=[tokenizer.pad_token_id]*padding_length
-        patch_ids+=[0]*padding_length
-        source_mask+=[0]*padding_length
-
-        assert len(source_ids) == args.max_source_length
-        assert len(source_mask) == args.max_source_length
-        assert len(patch_ids) == args.max_source_length
-
-        #target
-        if stage=="test":
-            target_tokens = tokenizer.tokenize("None")
-        else:
-            target_tokens = (example.target)[:args.max_target_length-2]
-        target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]
-        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
-        target_mask = [1] *len(target_ids)
-        padding_length = args.max_target_length - len(target_ids)
-        target_ids+=[tokenizer.pad_token_id]*padding_length
-        target_mask+=[0]*padding_length
-
-        if example_index < 5:
-            if stage=='train':
-                logger.info("*** Example ***")
-                logger.info("idx: {}".format(example.idx))
-
-                logger.info("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens]))
-                logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
-                logger.info("patch_ids: {}".format(' '.join(map(str, patch_ids))))
-                logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))
-
-                logger.info("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens]))
-                logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
-                logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))
-
-        features.append(
-            InputFeatures(
-                example_index,
-                source_ids,
-                target_ids,
-                source_mask,
-                target_mask,
-                patch_ids,
-            )
-        )
-    return features
-
-
-
 def set_seed(args):
     """set random seed."""
     random.seed(args.seed)
@@ -471,7 +377,7 @@ def main():
 f1.write(str(gold.idx)+'\t'+' '.join(gold.target)+'\n')
 
 (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "dev.gold"))
-dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0],2)
+dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)
 logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
 logger.info(" "+"*"*20)
 if dev_bleu>best_bleu:
@@ -528,7 +434,7 @@ def main():
 f1.write(str(gold.idx)+'\t'+' '.join(gold.target)+'\n')
 
 (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "test_{}.gold".format(idx)))
-dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0],2)
+dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)
 logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
 logger.info(" "+"*"*20)
 
......