graykode

(fixed) remove max_source_length, add diff_parse function

@@ -20,18 +20,25 @@ from transformers import AutoTokenizer
 from preprocess import diff_parse, truncate
 from train import BartForConditionalGeneration
 
+def get_length(chunks):
+    cnt = 0
+    for chunk in chunks:
+        cnt += len(chunk)
+    return cnt
+
+def suggester(chunks, model, tokenizer, device):
+    max_source_length = get_length(chunks)
 
-def suggester(chunks, max_source_length, model, tokenizer, device):
     input_ids, attention_masks, patch_ids = zip(*chunks)
-    input_ids = torch.LongTensor([truncate(input_ids, max_source_length, value=0)]).to(
-        device
-    )
+    input_ids = torch.LongTensor(
+        [truncate(input_ids, max_source_length, value=0)]
+    ).to(device)
     attention_masks = torch.LongTensor(
         [truncate(attention_masks, max_source_length, value=1)]
     ).to(device)
-    patch_ids = torch.LongTensor([truncate(patch_ids, max_source_length, value=0)]).to(
-        device
-    )
+    patch_ids = torch.LongTensor(
+        [truncate(patch_ids, max_source_length, value=0)]
+    ).to(device)
 
     summaries = model.generate(
         input_ids=input_ids, patch_ids=patch_ids, attention_mask=attention_masks
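For context, a toy check (not part of the commit) of what the new get_length helper computes: it simply sums the lengths of its chunk sequences, so the truncation length now tracks each input rather than a fixed command-line value. The sample chunks below are hypothetical.

    def get_length(chunks):  # same helper as added in the hunk above
        cnt = 0
        for chunk in chunks:
            cnt += len(chunk)
        return cnt

    chunks = [[7, 12, 99], [4, 5]]  # hypothetical token-id chunks
    assert get_length(chunks) == 5  # 3 + 2, i.e. sum(len(c) for c in chunks)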
@@ -59,9 +66,13 @@ def main(args):
     staged_files = [f.strip() for f in staged_files]
     chunks = "\n".join(staged_files)
 
+    chunks = diff_parse(chunks, tokenizer)
+    if not chunks:
+        print("There is no file in the staged state.")
+        return
+
     commit_message = suggester(
         chunks,
-        max_source_length=args.max_source_length,
         model=model,
         tokenizer=tokenizer,
         device=device,
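A minimal sketch of the new early-exit guard; fake_diff_parse is a hypothetical stand-in for preprocess.diff_parse, assuming only that it returns an empty sequence when nothing is staged:

    def fake_diff_parse(diff_text, tokenizer=None):
        # Stand-in only: the real diff_parse tokenizes the diff; this
        # just models "no staged changes in, no chunks out".
        return [] if not diff_text.strip() else [diff_text]

    chunks = fake_diff_parse("")  # e.g. `git diff --cached` printed nothing
    if not chunks:
        print("There is no file in the staged state.")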
@@ -89,13 +100,6 @@ if __name__ == "__main__":
         type=str,
         help="Pretrained tokenizer name or path if not the same as model_name",
     )
-    parser.add_argument(
-        "--max_source_length",
-        default=1024,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
     args = parser.parse_args()
 
     main(args)
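The call-site change in summary (names as in the script above; illustrative, not runnable on its own):

    # Before: a fixed CLI value was threaded through to suggester.
    # commit_message = suggester(
    #     chunks, max_source_length=args.max_source_length,
    #     model=model, tokenizer=tokenizer, device=device,
    # )

    # After: the length is derived from the parsed chunks inside suggester.
    commit_message = suggester(
        chunks, model=model, tokenizer=tokenizer, device=device
    )

Since argparse rejects unknown flags, any existing invocation that still passes --max_source_length will now fail with an "unrecognized arguments" error.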