Authored by graykode, 2020-11-01 17:53:28 +0900
Commit ad248582abe296e2f15198a92d8d9d093ae6e47a (1 parent: 3d6b29a6)

(add) patch ids embedding roberta model
Showing 3 changed files with 297 additions and 32 deletions:
code2nl/customized_roberta.py
code2nl/model.py
code2nl/run.py
code2nl/customized_roberta.py  0 → 100644 (new file)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RoBERTa model. """

import torch
import torch.nn as nn

from transformers.modeling_roberta import (
    create_position_ids_from_input_ids,
    RobertaPreTrainedModel,
    RobertaEncoder,
    RobertaPooler,
    BaseModelOutputWithPooling
)


class RobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.patch_type_embeddings = nn.Embedding(3, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, patch_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        # Copied from transformers.modeling_bert.BertEmbeddings.forward
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        if patch_ids is not None:
            patch_type_embeddings = self.patch_type_embeddings(patch_ids)
            embeddings += patch_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


class RobertaModel(RobertaPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well
    as a decoder, in which case a layer of cross-attention is added between
    the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the
    :obj:`is_decoder` argument of the configuration set to :obj:`True`.
    To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
    argument and :obj:`add_cross_attention` set to :obj:`True`; an
    :obj:`encoder_hidden_states` is then expected as an input to the forward pass.

    .. _`Attention is all you need`:
        https://arxiv.org/abs/1706.03762
    """

    authorized_missing_keys = [r"position_ids"]

    # Copied from transformers.modeling_bert.BertModel.__init__ with Bert->Roberta
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaEncoder(config)

        self.pooler = RobertaPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    # Copied from transformers.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids=None,
        patch_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids, patch_ids=patch_ids, position_ids=position_ids,
            token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
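The new embedding layer stacks a third learned table on top of RoBERTa's word, position and token-type embeddings: a 3-entry patch-type table indexed by patch_ids, where run.py below uses 1 for tokens from the added side of a diff, 2 for the deleted side, and 0 for padding. Below is a minimal, hedged sketch of driving the customized model directly; the tiny config and the sample strings are illustrative and not part of the commit.

# Hypothetical usage sketch (not part of the commit): a small randomly initialized
# config so the example runs on CPU; real runs load pretrained weights via run.py.
import torch
from transformers import RobertaConfig, RobertaTokenizer
from customized_roberta import RobertaModel   # the file added above (run from code2nl/)

config = RobertaConfig(vocab_size=50265, hidden_size=64, num_hidden_layers=2,
                       num_attention_heads=2, intermediate_size=128)
model = RobertaModel(config)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

added = tokenizer.tokenize("return a + b")     # tokens from the '+' side of a diff
deleted = tokenizer.tokenize("return a - b")   # tokens from the '-' side of a diff
tokens = [tokenizer.cls_token] + added + [tokenizer.sep_token] + deleted + [tokenizer.sep_token]
patch_ids = [1] * (len(added) + 2) + [2] * (len(deleted) + 1)   # 1 = added span, 2 = deleted span, 0 = padding

input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
patch_ids = torch.tensor([patch_ids])
outputs = model(input_ids=input_ids, patch_ids=patch_ids)
print(outputs[0].shape)   # (1, seq_len, hidden_size); patch-type embeddings are added before LayerNorm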
code2nl/model.py

@@ -51,8 +51,8 @@ class Seq2Seq(nn.Module):
         self._tie_or_clone_weights(self.lm_head, self.encoder.embeddings.word_embeddings)

-    def forward(self, source_ids=None, source_mask=None, target_ids=None, target_mask=None, args=None):
-        outputs = self.encoder(source_ids, attention_mask=source_mask)
+    def forward(self, source_ids=None, source_mask=None, target_ids=None, target_mask=None, patch_ids=None, args=None):
+        outputs = self.encoder(source_ids, attention_mask=source_mask, patch_ids=patch_ids)
         encoder_output = outputs[0].permute([1, 0, 2]).contiguous()
         if target_ids is not None:
             attn_mask = -1e4 * (1 - self.bias[:target_ids.shape[1], :target_ids.shape[1]])
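Two context lines in this hunk are worth unpacking: the encoder output is permuted from (batch, seq_len, hidden) to (seq_len, batch, hidden) because the nn.TransformerDecoder built in run.py expects sequence-first inputs, and attn_mask is a causal mask derived from self.bias. The stand-alone illustration below assumes self.bias is a lower-triangular ones buffer, which is not shown in this diff.

import torch

# Sequence-first memory for nn.TransformerDecoder (mirrors the permute([1, 0, 2]) above).
encoder_hidden = torch.randn(2, 8, 64)                    # (batch, seq_len, hidden)
memory = encoder_hidden.permute([1, 0, 2]).contiguous()   # (seq_len, batch, hidden)
print(memory.shape)                                        # torch.Size([8, 2, 64])

# Hypothetical stand-in for self.bias: a lower-triangular causal buffer.
bias = torch.tril(torch.ones(6, 6))
target_len = 4
attn_mask = -1e4 * (1 - bias[:target_len, :target_len])
print(attn_mask)   # 0 on/below the diagonal, -1e4 above: future target positions are masked out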
code2nl/run.py
@@ -35,10 +35,11 @@ from itertools import cycle
 import torch.nn as nn
 from model import Seq2Seq
 from tqdm import tqdm, trange
+from customized_roberta import RobertaModel
 from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-                          RobertaConfig, RobertaModel, RobertaTokenizer)
+                          RobertaConfig, RobertaTokenizer)
 MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
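With this import swap, the 'roberta' entry in MODEL_CLASSES now resolves to the local customized RobertaModel (with patch_ids support) instead of the stock Hugging Face class. Below is a hedged sketch of how the tuple is presumably consumed further down in main(); the checkpoint name is illustrative.

from transformers import RobertaConfig, RobertaTokenizer
from customized_roberta import RobertaModel   # local class added in this commit

MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

config_class, model_class, tokenizer_class = MODEL_CLASSES['roberta']
config = config_class.from_pretrained('microsoft/codebert-base')        # illustrative checkpoint
tokenizer = tokenizer_class.from_pretrained('microsoft/codebert-base')  # illustrative checkpoint
encoder = model_class(config=config)   # randomly initialized here; weights can be restored via args.load_model_path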
@@ -50,11 +51,13 @@ class Example(object):
     """A single training/test example."""
     def __init__(self,
                  idx,
-                 source,
+                 added,
+                 deleted,
                  target,
                  ):
         self.idx = idx
-        self.source = source
+        self.added = added
+        self.deleted = deleted
         self.target = target

 def read_examples(filename):
@@ -66,16 +69,13 @@ def read_examples(filename):
             js = json.loads(line)
             if 'idx' not in js:
                 js['idx'] = idx
-            code = ' '.join(js['code_tokens']).replace('\n', ' ')
-            code = ' '.join(code.strip().split())
-            nl = ' '.join(js['docstring_tokens']).replace('\n', '')
-            nl = ' '.join(nl.strip().split())
             examples.append(
                 Example(
                     idx=idx,
-                    source=code,
-                    target=nl,
+                    added=js['added'],
+                    deleted=js['deleted'],
+                    target=js['msg'],
                 )
             )
     return examples
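read_examples now expects each JSON line to carry added, deleted and msg fields; the old code_tokens/docstring_tokens pipeline is dropped. Below is a hypothetical record, assuming the fields hold pre-tokenized lists as the concatenation in convert_examples_to_features further down implies; the token values are purely illustrative.

import json

# Hypothetical training line; the field names match the keys read above.
line = json.dumps({
    "added": ["def", "Ġadd", "(", "a", ",", "Ġb", ")", ":", "Ġreturn", "Ġa", "Ġ+", "Ġb"],
    "deleted": ["def", "Ġadd", "(", "a", ",", "Ġb", ")", ":", "Ġpass"],
    "msg": ["Add", "Ġadd", "Ġfunction"]
})
js = json.loads(line)               # as in read_examples
print(js["added"], js["deleted"], js["msg"])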
@@ -88,13 +88,15 @@ class InputFeatures(object):
                  target_ids,
                  source_mask,
                  target_mask,
+                 patch_ids,
                  ):
         self.example_id = example_id
         self.source_ids = source_ids
         self.target_ids = target_ids
         self.source_mask = source_mask
         self.target_mask = target_mask
+        self.patch_ids = patch_ids
@@ -102,19 +104,26 @@ def convert_examples_to_features(examples, tokenizer, args,stage=None):
     features = []
     for example_index, example in enumerate(examples):
         #source
-        source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length - 2]
-        source_tokens = [tokenizer.cls_token] + source_tokens + [tokenizer.sep_token]
-        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
+        added_tokens = [tokenizer.cls_token] + example.added + [tokenizer.sep_token]
+        deleted_tokens = example.deleted + [tokenizer.sep_token]
+        source_tokens = added_tokens + deleted_tokens
+        patch_ids = [1] * len(added_tokens) + [2] * len(deleted_tokens)
+        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
         source_mask = [1] * (len(source_tokens))
         padding_length = args.max_source_length - len(source_ids)
         source_ids += [tokenizer.pad_token_id] * padding_length
+        patch_ids += [0] * padding_length
         source_mask += [0] * padding_length
         assert len(source_ids) == args.max_source_length
         assert len(source_mask) == args.max_source_length
+        assert len(patch_ids) == args.max_source_length

         #target
         if stage == "test":
             target_tokens = tokenizer.tokenize("None")
         else:
-            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
+            target_tokens = (example.target)[:args.max_target_length - 2]
         target_tokens = [tokenizer.cls_token] + target_tokens + [tokenizer.sep_token]
         target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
         target_mask = [1] * len(target_ids)
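To make the new source-side layout concrete, here is a small worked example of the logic above with toy token lists and a toy max_source_length; the real values come from the tokenizer and args.

# Toy stand-ins for tokenizer.cls_token / tokenizer.sep_token and args.max_source_length.
cls_token, sep_token = "<s>", "</s>"
added = ["Ġreturn", "Ġa", "Ġ+", "Ġb"]
deleted = ["Ġreturn", "Ġa", "Ġ-", "Ġb"]
max_source_length = 12

added_tokens = [cls_token] + added + [sep_token]        # 6 tokens, patch id 1
deleted_tokens = deleted + [sep_token]                   # 5 tokens, patch id 2
source_tokens = added_tokens + deleted_tokens            # 11 tokens in total
patch_ids = [1] * len(added_tokens) + [2] * len(deleted_tokens)
source_mask = [1] * len(source_tokens)

padding_length = max_source_length - len(source_tokens)  # source_ids would be padded with pad_token_id
patch_ids += [0] * padding_length                         # padding positions get patch id 0
source_mask += [0] * padding_length

print(patch_ids)    # [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0]
print(source_mask)  # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]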
@@ -129,6 +138,7 @@ def convert_examples_to_features(examples, tokenizer, args,stage=None):
                 logger.info("source_tokens: {}".format([x.replace('\u0120', '_') for x in source_tokens]))
                 logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
+                logger.info("patch_ids: {}".format(' '.join(map(str, patch_ids))))
                 logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))
                 logger.info("target_tokens: {}".format([x.replace('\u0120', '_') for x in target_tokens]))

@@ -142,6 +152,7 @@ def convert_examples_to_features(examples, tokenizer, args,stage=None):
                 target_ids,
                 source_mask,
                 target_mask,
+                patch_ids,
             )
         )
     return features
@@ -255,7 +266,7 @@ def main():
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)

     #budild model
-    encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
+    encoder = model_class(config=config)
     decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
     decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
     model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,

@@ -263,7 +274,7 @@ def main():
                     sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
     if args.load_model_path is not None:
         logger.info("reload model from {}".format(args.load_model_path))
-        model.load_state_dict(torch.load(args.load_model_path))
+        model.load_state_dict(torch.load(args.load_model_path), strict=False)

     model.to(device)
     if args.local_rank != -1:
@@ -289,7 +300,8 @@ def main():
         all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long)
         all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
         all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+        all_patch_ids = torch.tensor([f.patch_ids for f in train_features], dtype=torch.long)
+        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask, all_patch_ids)

         if args.local_rank == -1:
             train_sampler = RandomSampler(train_data)

@@ -327,8 +339,9 @@ def main():
         for step in bar:
             batch = next(train_dataloader)
             batch = tuple(t.to(device) for t in batch)
-            source_ids, source_mask, target_ids, target_mask = batch
-            loss, _, _ = model(source_ids=source_ids, source_mask=source_mask, target_ids=target_ids, target_mask=target_mask)
+            source_ids, source_mask, target_ids, target_mask, patch_ids = batch
+            loss, _, _ = model(source_ids=source_ids, source_mask=source_mask, target_ids=target_ids, target_mask=target_mask, patch_ids=patch_ids)

             if args.n_gpu > 1:
                 loss = loss.mean()  # mean() to average on multi-gpu.
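Because all_patch_ids is appended as a fifth tensor to the TensorDataset, every batch drawn from the DataLoader now unpacks into five tensors, matching the new model signature. A self-contained sketch with dummy data:

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

# Dummy tensors with the same roles as the ones built from train_features above.
n, src_len, tgt_len = 4, 8, 6
all_source_ids = torch.randint(5, 100, (n, src_len))
all_source_mask = torch.ones(n, src_len, dtype=torch.long)
all_target_ids = torch.randint(5, 100, (n, tgt_len))
all_target_mask = torch.ones(n, tgt_len, dtype=torch.long)
all_patch_ids = torch.randint(0, 3, (n, src_len))      # values in {0, 1, 2}

train_data = TensorDataset(all_source_ids, all_source_mask,
                           all_target_ids, all_target_mask, all_patch_ids)
loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=2)

for batch in loader:
    source_ids, source_mask, target_ids, target_mask, patch_ids = batch   # five tensors now
    print(source_ids.shape, patch_ids.shape)
    break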
@@ -363,7 +376,8 @@ def main():
                 all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
                 all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
                 all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long)
-                eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+                all_patch_ids = torch.tensor([f.patch_ids for f in eval_features], dtype=torch.long)
+                eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask, all_patch_ids)
                 dev_dataset['dev_loss'] = eval_examples, eval_data
                 eval_sampler = SequentialSampler(eval_data)
                 eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

@@ -377,11 +391,11 @@ def main():
             eval_loss, tokens_num = 0, 0
             for batch in eval_dataloader:
                 batch = tuple(t.to(device) for t in batch)
-                source_ids, source_mask, target_ids, target_mask = batch
+                source_ids, source_mask, target_ids, target_mask, patch_ids = batch

                 with torch.no_grad():
                     _, loss, num = model(source_ids=source_ids, source_mask=source_mask,
-                                         target_ids=target_ids, target_mask=target_mask)
+                                         target_ids=target_ids, target_mask=target_mask, patch_ids=patch_ids)
                 eval_loss += loss.sum().item()
                 tokens_num += num.sum().item()
             #Pring loss of dev dataset
@@ -423,7 +437,8 @@ def main():
                     eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                     all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                     all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
-                    eval_data = TensorDataset(all_source_ids, all_source_mask)
+                    all_patch_ids = torch.tensor([f.patch_ids for f in eval_features], dtype=torch.long)
+                    eval_data = TensorDataset(all_source_ids, all_source_mask, all_patch_ids)
                     dev_dataset['dev_bleu'] = eval_examples, eval_data

@@ -435,9 +450,9 @@ def main():
                 p = []
                 for batch in eval_dataloader:
                     batch = tuple(t.to(device) for t in batch)
-                    source_ids, source_mask = batch
+                    source_ids, source_mask, patch_ids = batch
                     with torch.no_grad():
-                        preds = model(source_ids=source_ids, source_mask=source_mask)
+                        preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
                         for pred in preds:
                             t = pred[0].cpu().numpy()
                             t = list(t)
@@ -481,7 +496,8 @@ def main():
         eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
         all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
         all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
-        eval_data = TensorDataset(all_source_ids, all_source_mask)
+        all_patch_ids = torch.tensor([f.patch_ids for f in eval_features], dtype=torch.long)
+        eval_data = TensorDataset(all_source_ids, all_source_mask, all_patch_ids)

         # Calculate bleu
         eval_sampler = SequentialSampler(eval_data)

@@ -491,9 +507,9 @@ def main():
             p = []
             for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                 batch = tuple(t.to(device) for t in batch)
-                source_ids, source_mask = batch
+                source_ids, source_mask, patch_ids = batch
                 with torch.no_grad():
-                    preds = model(source_ids=source_ids, source_mask=source_mask)
+                    preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
                     for pred in preds:
                         t = pred[0].cpu().numpy()
                         t = list(t)