Resnet18 Use data set change

김건
Commit 39d50f91650a14335986049fe4272e54ba10ebd8 39d50f91 1 parent 6e786b83
Showing 26 changed files with 722 additions and 2 deletions
Speaker_Recognition/configure.py
Speaker_Recognition/configure1_merge.py
Speaker_Recognition/configure1_zeroth.py
Speaker_Recognition/enroll1.py
Speaker_Recognition/enroll_embeddings1/103F3021.pth
Speaker_Recognition/enroll_embeddings1/207F2088.pth
Speaker_Recognition/enroll_embeddings1/213F5100.pth
Speaker_Recognition/enroll_embeddings1/217F3038.pth
Speaker_Recognition/enroll_embeddings1/225M4062.pth
Speaker_Recognition/enroll_embeddings1/229M2031.pth
Speaker_Recognition/enroll_embeddings1/230M4087.pth
Speaker_Recognition/enroll_embeddings1/233F4013.pth
Speaker_Recognition/enroll_embeddings1/236M3043.pth
Speaker_Recognition/enroll_embeddings1/240M3063.pth
Speaker_Recognition/enroll_embeddings1/777M7777.pth
Speaker_Recognition/enroll_embeddings1/778M8777.pth
Speaker_Recognition/enroll_embeddings1/779M9777.pth
Speaker_Recognition/enroll_embeddings1/999M9999.pth
Speaker_Recognition/enroll_embeddings1/zerothfloac.pth
Speaker_Recognition/identification.py
--- a/Speaker_Recognition/configure.py
View file @39d50f9
+++ b/Speaker_Recognition/configure.py
View file @39d50f9
@@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs'
 # Feature path
 TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
+# TRAIN_FEAT_DIR = '/test/merge_dataset'
+# TRAIN_FEAT_DIR = '/test/trainFeature'
 TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
 # Context window size
--- a/Speaker_Recognition/configure1_merge.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/configure1_merge.py 0 → 100644
View file @39d50f9
+# Data locations and feature-extraction settings for the "merge" dataset,
+# exposed as module-level constants.
+# NOTE(review): enroll1.py, identification1.py and verification1.py in this
+# commit all do `import configure as c`, so this module only takes effect if
+# it is imported (or copied) under that name — confirm how it is selected.
+
+# Wave path
+# NOTE(review): absolute, machine-specific paths.
+TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
+DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
+TEST_WAV_DIR = 'test_wavs'
+
+# Feature path
+# (the commented-out assignments are alternative locations kept for reference)
+# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
+TRAIN_FEAT_DIR = '/test/merge_train_dataset'
+# TRAIN_FEAT_DIR = '/test/trainFeature'
+# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
+TEST_FEAT_DIR = '/test/merge_test_dataset'
+# Context window size
+# NOTE(review): the trailing `#10` looks like the previous value — confirm.
+NUM_WIN_SIZE = 100 #10
+
+# Settings for feature extraction
+USE_LOGSCALE = True
+USE_DELTA = False
+USE_SCALE = False
+SAMPLE_RATE = 16000  # presumably Hz — confirm against the feature extractor
+FILTER_BANK = 40  # presumably the number of filter-bank channels — confirm
\ No newline at end of file
--- a/Speaker_Recognition/configure1_zeroth.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/configure1_zeroth.py 0 → 100644
View file @39d50f9
+# Data locations and feature-extraction settings for the "zeroth" dataset,
+# exposed as module-level constants.
+# NOTE(review): enroll1.py, identification1.py and verification1.py in this
+# commit all do `import configure as c`, so this module only takes effect if
+# it is imported (or copied) under that name — confirm how it is selected.
+
+# Wave path
+# NOTE(review): absolute, machine-specific paths.
+TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
+DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
+TEST_WAV_DIR = 'test_wavs'
+
+# Feature path
+# (the commented-out assignments are alternative locations kept for reference)
+# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
+TRAIN_FEAT_DIR = '/test/zeroth_train_dataset'
+# TRAIN_FEAT_DIR = '/test/trainFeature'
+# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
+TEST_FEAT_DIR = '/test/zeroth_test_dataset'
+# Context window size
+# NOTE(review): the trailing `#10` looks like the previous value — confirm.
+NUM_WIN_SIZE = 100 #10
+
+# Settings for feature extraction
+USE_LOGSCALE = True
+USE_DELTA = False
+USE_SCALE = False
+SAMPLE_RATE = 16000  # presumably Hz — confirm against the feature extractor
+FILTER_BANK = 40  # presumably the number of filter-bank channels — confirm
\ No newline at end of file
--- a/Speaker_Recognition/enroll1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll1.py 0 → 100644
View file @39d50f9
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model1 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda (bool): Move the model to the GPU when True.
+        log_dir (str): Directory that holds ``checkpoint_<cp_num>.pth``.
+        cp_num (int): Number of the checkpoint file to load.
+        embedding_size (int): Forwarded to ``background_resnet``.
+        n_classes (int): Forwarded to ``background_resnet`` as ``num_classes``.
+
+    Returns:
+        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without CUDA, remap the stored tensors to the CPU so that a checkpoint
+    # written on a GPU machine can still be loaded.
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix (DataParallel) is
+    # stripped here, so the checkpoint keys must already match the model.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature files under ``dataroot_dir`` into enroll and test sets.
+
+    Args:
+        dataroot_dir (str): Directory passed to ``read_feats_structure``.
+
+    Returns:
+        tuple: ``(enroll_DB, test_DB)`` - the rows whose ``filename`` contains
+        ``enroll.p`` and ``test.p`` respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+
+    # regex=False: match the names literally. Interpreted as a regular
+    # expression the '.' matches any character, so a path containing e.g.
+    # 'test/p...' would wrongly be classified as a test file.
+    is_enroll = DB_all['filename'].str.contains('enroll.p', regex=False)
+    is_test = DB_all['filename'].str.contains('test.p', regex=False)
+
+    # Reset the index
+    enroll_DB = DB_all[is_enroll].reset_index(drop=True)
+    test_DB = DB_all[is_test].reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalised embedding for the utterance in ``filename``.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter); each segment is passed through ``model``
+    and the resulting embeddings are summed before normalisation.
+
+    Args:
+        use_cuda (bool): Move each segment to the GPU before the forward pass.
+        filename (str): Feature file read with ``read_MFB``.
+        model: Network whose forward pass returns ``(embedding, output)``.
+        test_frames (int): Number of frames per segment.
+
+    Returns:
+        torch.Tensor: The summed embedding after ``l2_norm(..., 1)``.
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)
+
+    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
+    if tot_segments == 0:
+        # Otherwise `activation` would stay the int 0 and l2_norm() would
+        # fail with an unrelated AttributeError.
+        raise ValueError("No frames found in %s" % filename)
+
+    to_tensor = ToTensorTestInput() # built once, reused for every segment
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            start = i*test_frames
+            segment = to_tensor(feats[start:start+test_frames]) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation,_ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to unit L2 norm, then multiply by ``alpha``.
+
+    Args:
+        input (torch.Tensor): Tensor expected to be 2-D (rows, dim); the norm
+            is taken along dim 1.
+        alpha: Factor applied to the normalised rows.
+
+    Returns:
+        torch.Tensor: Tensor of the same size as ``input``.
+    """
+    # Squared row norms; the small constant keeps the division finite for
+    # all-zero rows.
+    squared_norm = torch.sum(torch.pow(input, 2), 1).add_(1e-10)
+    row_norm = torch.sqrt(squared_norm).view(-1, 1)
+    normalised = torch.div(input, row_norm.expand_as(input)).view(input.size())
+    # alpha is a scaling factor (cf. https://arxiv.org/pdf/1703.09507.pdf)
+    return normalised * alpha
+
+def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
+    """
+    Accumulate the d-vectors of all enrollment files per speaker and save them.
+
+    The per-file embeddings of a speaker are summed (they are not divided by
+    the number of files). One ``<speaker_id>.pth`` file per speaker is written
+    to ``embedding_dir``, which is created if needed.
+
+    Args:
+        use_cuda (bool): Forwarded to ``get_embeddings``.
+        test_frames (int): Forwarded to ``get_embeddings``.
+        model: Forwarded to ``get_embeddings``.
+        DB: Table with ``filename`` and ``speaker_id`` columns.
+        embedding_dir (str): Output directory for the saved embeddings.
+
+    Returns:
+        dict: speaker id -> accumulated embedding (one entry per speaker).
+    """
+    embeddings = {}
+
+    # Aggregates all the activations
+    print("Start to aggregate all the d-vectors per enroll speaker")
+
+    # Iterate over the column values directly so the loop does not rely on
+    # DB carrying a 0..n-1 index.
+    for filename, spk in zip(DB['filename'], DB['speaker_id']):
+        activation = get_embeddings(use_cuda, filename, model, test_frames)
+        if spk in embeddings:
+            embeddings[spk] += activation
+        else:
+            embeddings[spk] = activation
+
+        print("Aggregates the activation (spk : %s)" % (spk))
+
+    # exist_ok avoids the check-then-create race of a separate exists() test
+    os.makedirs(embedding_dir, exist_ok=True)
+
+    # Save the embeddings
+    for spk_index in sorted(embeddings):
+        embedding_path = os.path.join(embedding_dir, spk_index+'.pth')
+        torch.save(embeddings[spk_index], embedding_path)
+        print("Save the embeddings for %s" % (spk_index))
+    return embeddings
+    
+def main():
+    """Enroll the speakers found under ``c.TEST_FEAT_DIR``.
+
+    Loads a checkpoint, selects the enrollment files and writes one embedding
+    file per speaker to ``enroll_embeddings1``.
+    """
+        
+    # Settings
+    use_cuda = True
+    log_dir = 'new_model1'
+    embedding_size = 128
+    # NOTE(review): identification1.py and verification1.py in this commit load
+    # checkpoints 30 and 29 and use test_frames = 100 - confirm that embeddings
+    # enrolled with checkpoint 24 / 200 frames are meant to be compared to them.
+    cp_num = 24 # Which checkpoint to use?
+    n_classes = 241
+    test_frames = 200
+    
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+    
+    # Get the dataframe for enroll DB (test_DB is not used in this script)
+    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
+    
+    # Where to save embeddings
+    embedding_dir = 'enroll_embeddings1'
+    
+    # Perform the enrollment and save the results
+    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
+    
+    # The string below is a bare expression used as a note; it has no effect.
+    """ Test speaker list
+    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', 
+    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    """ 
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/Speaker_Recognition/enroll_embeddings1/103F3021.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/103F3021.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/207F2088.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/207F2088.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/213F5100.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/213F5100.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/217F3038.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/217F3038.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/225M4062.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/225M4062.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/229M2031.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/229M2031.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/230M4087.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/230M4087.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/233F4013.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/233F4013.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/236M3043.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/236M3043.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/240M3063.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/240M3063.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/777M7777.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/777M7777.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/778M8777.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/778M8777.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/779M9777.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/779M9777.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/999M9999.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/999M9999.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/enroll_embeddings1/zerothfloac.pth 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/enroll_embeddings1/zerothfloac.pth 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/identification.py
View file @39d50f9
+++ b/Speaker_Recognition/identification.py
View file @39d50f9
@@ -123,10 +123,10 @@ def main():
     """ 
     spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
-    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063']
+    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
     # Set the test speaker
-    test_speaker = '230M4087' 
+    test_speaker = '778M8777' 
     test_path = os.path.join(test_dir, test_speaker, 'test.p')
--- a/Speaker_Recognition/identification1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/identification1.py 0 → 100644
View file @39d50f9
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model1 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda (bool): Move the model to the GPU when True.
+        log_dir (str): Directory that holds ``checkpoint_<cp_num>.pth``.
+        cp_num (int): Number of the checkpoint file to load.
+        embedding_size (int): Forwarded to ``background_resnet``.
+        n_classes (int): Forwarded to ``background_resnet`` as ``num_classes``.
+
+    Returns:
+        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without CUDA, remap the stored tensors to the CPU so that a checkpoint
+    # written on a GPU machine can still be loaded.
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix (DataParallel) is
+    # stripped here, so the checkpoint keys must already match the model.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature files under ``dataroot_dir`` into enroll and test sets.
+
+    Args:
+        dataroot_dir (str): Directory passed to ``read_feats_structure``.
+
+    Returns:
+        tuple: ``(enroll_DB, test_DB)`` - the rows whose ``filename`` contains
+        ``enroll.p`` and ``test.p`` respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+
+    # regex=False: match the names literally. Interpreted as a regular
+    # expression the '.' matches any character, so a path containing e.g.
+    # 'test/p...' would wrongly be classified as a test file.
+    is_enroll = DB_all['filename'].str.contains('enroll.p', regex=False)
+    is_test = DB_all['filename'].str.contains('test.p', regex=False)
+
+    # Reset the index
+    enroll_DB = DB_all[is_enroll].reset_index(drop=True)
+    test_DB = DB_all[is_test].reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir, map_location=None):
+    """Load every ``<speaker>.pth`` embedding stored in ``embedding_dir``.
+
+    Entries that are not regular ``.pth`` files (sub-directories, notes,
+    editor checkpoints, ...) are skipped instead of being handed to
+    ``torch.load``.
+
+    Args:
+        embedding_dir (str): Directory containing the saved embeddings.
+        map_location: Forwarded to ``torch.load``; pass ``'cpu'`` to load
+            embeddings that were saved from a GPU on a CPU-only machine.
+
+    Returns:
+        dict: speaker name (file name without the ``.pth`` suffix) -> the
+        object stored in that file.
+    """
+    embeddings = {}
+    for f in sorted(os.listdir(embedding_dir)):
+        # splitext removes only the trailing extension, so a '.pth' that
+        # occurs inside the speaker name is preserved.
+        spk, ext = os.path.splitext(f)
+        embedding_path = os.path.join(embedding_dir, f)
+        if ext != '.pth' or not os.path.isfile(embedding_path):
+            continue
+        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalised embedding for the utterance in ``filename``.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter); each segment is passed through ``model``
+    and the resulting embeddings are summed before normalisation.
+
+    Args:
+        use_cuda (bool): Move each segment to the GPU before the forward pass.
+        filename (str): Feature file read with ``read_MFB``.
+        model: Network whose forward pass returns ``(embedding, output)``.
+        test_frames (int): Number of frames per segment.
+
+    Returns:
+        torch.Tensor: The summed embedding after ``l2_norm(..., 1)``.
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)
+
+    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
+    if tot_segments == 0:
+        # Otherwise `activation` would stay the int 0 and l2_norm() would
+        # fail with an unrelated AttributeError.
+        raise ValueError("No frames found in %s" % filename)
+
+    to_tensor = ToTensorTestInput() # built once, reused for every segment
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            start = i*test_frames
+            segment = to_tensor(feats[start:start+test_frames]) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation,_ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to unit L2 norm, then multiply by ``alpha``.
+
+    Args:
+        input (torch.Tensor): Tensor expected to be 2-D (rows, dim); the norm
+            is taken along dim 1.
+        alpha: Factor applied to the normalised rows.
+
+    Returns:
+        torch.Tensor: Tensor of the same size as ``input``.
+    """
+    # Squared row norms; the small constant keeps the division finite for
+    # all-zero rows.
+    squared_norm = torch.sum(torch.pow(input, 2), 1).add_(1e-10)
+    row_norm = torch.sqrt(squared_norm).view(-1, 1)
+    normalised = torch.div(input, row_norm.expand_as(input)).view(input.size())
+    # alpha is a scaling factor (cf. https://arxiv.org/pdf/1703.09507.pdf)
+    return normalised * alpha
+
+def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
+    """Pick the enrolled speaker whose embedding is closest to the test utterance.
+
+    Args:
+        use_cuda (bool): Forwarded to ``get_embeddings``.
+        model: Forwarded to ``get_embeddings``.
+        embeddings (dict): speaker id -> enrolled embedding.
+        test_filename (str): Feature file of the test utterance; the name of
+            its parent directory (up to the first ``_``) is reported as the
+            true speaker.
+        test_frames (int): Forwarded to ``get_embeddings``.
+        spk_list: Candidate speaker ids; each must be a key of ``embeddings``.
+
+    Returns:
+        The speaker id with the highest cosine similarity, or None when
+        ``spk_list`` is empty.
+
+    Raises:
+        KeyError: If a speaker in ``spk_list`` has no enrolled embedding.
+    """
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    max_score = float('-inf')
+    best_spk = None
+    for spk in spk_list:
+        # Loaded embeddings keep the device they were saved from; move them
+        # next to the test embedding so the comparison cannot fail on a
+        # CPU/GPU mismatch.
+        enroll_embedding = embeddings[spk].to(test_embedding.device)
+        # .item() yields a plain Python float instead of a 1-element array
+        score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+        if score > max_score:
+            max_score = score
+            best_spk = spk
+    # Use os.path so the parent directory is found regardless of the path
+    # separator that os.path.join() produced.
+    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
+    print("\n=== Speaker identification ===")
+    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
+    return best_spk
+
+def main():
+    """Identify the speaker of one hard-coded test utterance.
+
+    Loads a checkpoint and the enrolled embeddings, then compares the test
+    utterance against every speaker in ``spk_list``.
+    """
+    
+    log_dir = 'new_model1' # Where the checkpoints are saved
+    embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
+    # NOTE(review): hard-coded, while split_enroll_and_test() below reads
+    # c.TEST_FEAT_DIR - confirm both point at the same data.
+    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
+    
+    # Settings
+    use_cuda = True # Use cuda or not
+    embedding_size = 128 # Dimension of speaker embeddings
+    # NOTE(review): enroll1.py in this commit enrolls with checkpoint 24 and
+    # test_frames = 200 - confirm the mismatch with the values here is intended.
+    cp_num = 30 # Which checkpoint to use?
+    n_classes = 241 # How many speakers in training data?
+    test_frames = 100 # Split the test utterance 
+
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # Get the dataframe for test DB (neither result is used below)
+    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
+    
+    # Load enroll embeddings
+    embeddings = load_enroll_embeddings(embedding_dir)
+    
+    # The string below is a bare expression used as a note; it has no effect.
+    """ Test speaker list
+    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', 
+    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    """ 
+    
+    # Candidates; every id needs a matching .pth file in embedding_dir
+    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
+    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
+    
+    # Set the test speaker
+    test_speaker = '213F5100' 
+    
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+    
+    # Perform the test 
+    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
+
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/model/model1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/model/model1.py 0 → 100644
View file @39d50f9
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+import model.resnet as resnet
+
+
+class background_resnet(nn.Module):
+    """ResNet feature extractor followed by an embedding and a classifier layer.
+
+    Only ``conv1``/``bn1``/``relu`` and ``layer1``-``layer4`` of the backbone
+    are used; the pooled features go through ``fc0`` (the speaker embedding),
+    then ``bn0`` + ReLU and ``last`` (the class scores).
+
+    Args:
+        embedding_size (int): Dimension of the speaker embedding.
+        num_classes (int): Number of outputs of the classification layer.
+        backbone (str): One of ``resnet18``, ``resnet34``, ``resnet50``,
+            ``resnet101`` or ``resnet152``.
+
+    Raises:
+        RuntimeError: If ``backbone`` is not one of the supported names.
+    """
+
+    _BACKBONES = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
+
+    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
+        super(background_resnet, self).__init__()
+        self.backbone = backbone
+        if backbone not in self._BACKBONES:
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        # The networks are built untrained (pretrained=False)
+        self.pretrained = getattr(resnet, backbone)(pretrained=False)
+
+        # Take the feature width from the backbone's own classifier
+        # (128 * block.expansion) rather than hard-coding 128, which is only
+        # correct for the BasicBlock backbones (resnet18 / resnet34).
+        self.fc0 = nn.Linear(self.pretrained.fc.in_features, embedding_size)
+        self.bn0 = nn.BatchNorm1d(embedding_size)
+        self.relu = nn.ReLU()
+        self.last = nn.Linear(embedding_size, num_classes)
+
+    def forward(self, x):
+        """Return ``(spk_embedding, out)`` for a batch of inputs.
+
+        ``spk_embedding`` is the output of ``fc0``; ``out`` is the output of
+        the classification layer ``last``.
+        """
+        # input x: minibatch x 1 x 40 x 40
+        x = self.pretrained.conv1(x)
+        x = self.pretrained.bn1(x)
+        x = self.pretrained.relu(x)
+
+        x = self.pretrained.layer1(x)
+        x = self.pretrained.layer2(x)
+        x = self.pretrained.layer3(x)
+        x = self.pretrained.layer4(x)
+
+        out = F.adaptive_avg_pool2d(x,1) # [batch, n_feat, 1, 1]
+        # flatten the out so that the fully connected layer can be connected from here
+        out = out.view(x.size(0), -1) # (n_batch, n_feat)
+        spk_embedding = self.fc0(out)
+        out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
+        out = self.last(out)
+
+        return spk_embedding, out
\ No newline at end of file
--- a/Speaker_Recognition/model/resnet.py
View file @39d50f9
+++ b/Speaker_Recognition/model/resnet.py
View file @39d50f9
@@ -113,6 +113,7 @@ class ResNet(nn.Module):
         self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
         self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
         self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
+
         self.avgpool = nn.AvgPool2d(1, stride=1)
         self.fc = nn.Linear(128 * block.expansion, num_classes)
--- a/Speaker_Recognition/model/resnet1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/model/resnet1.py 0 → 100644
View file @39d50f9
+"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+and added support for the 1x32x32 mel spectrogram for the speech recognition.
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
+https://arxiv.org/abs/1512.03385
+"""
+
+import torch.nn as nn
+import math
+import torch.utils.model_zoo as model_zoo
+
+
+__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+           'resnet152']
+
+
+model_urls = {
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 ``nn.Conv2d`` with a padding of 1."""
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 convolutions (channel expansion 1)."""
+
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        """Create the block.
+
+        Args:
+            inplanes (int): Channels of the input.
+            planes (int): Channels produced by both convolutions.
+            stride (int): Stride of the first convolution.
+            downsample: Optional module applied to the input before it is
+                added back to the convolution output.
+        """
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Apply conv-bn-relu-conv-bn, add the shortcut, then ReLU."""
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """Residual block of 1x1, 3x3 and 1x1 convolutions (channel expansion 4)."""
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        """Create the block.
+
+        Args:
+            inplanes (int): Channels of the input.
+            planes (int): Channels of the two inner convolutions; the block
+                outputs ``planes * 4`` channels.
+            stride (int): Stride of the 3x3 convolution.
+            downsample: Optional module applied to the input before it is
+                added back to the convolution output.
+        """
+        super(Bottleneck, self).__init__()
+        out_planes = planes * 4
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Run the three conv-bn stages, add the shortcut, then ReLU."""
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        out += shortcut
+        return self.relu(out)
+
+
+class ResNet(nn.Module):
+    # ResNet variant with a single-channel input by default and stage widths
+    # of 16/32/64/128 (plus an extra 256-wide `layer5`, see notes below).
+    #
+    # Args (constructor):
+    #   block: residual block class (BasicBlock or Bottleneck).
+    #   layers: number of blocks per stage; indices 0..3 are read.
+    #   num_classes: output size of the final `fc` layer.
+    #   in_channels: channels of the input fed to `conv1`.
+
+    def __init__(self, block, layers, num_classes=1000, in_channels=1):
+        # Running channel count consumed and updated by _make_layer()
+        self.inplanes = 16
+        super(ResNet, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
+                               bias=False) # ori : stride = 2
+        self.bn1 = nn.BatchNorm2d(16)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 16, layers[0])
+        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
+        # NOTE(review): layer5 reuses layers[3] (not layers[4]) and is never
+        # called in forward(); it only adds parameters - confirm the intent.
+        self.layer5 = self._make_layer(block, 256, layers[3], stride=2)
+        # A 1x1 average pool with stride 1 leaves the spatial size unchanged.
+        self.avgpool = nn.AvgPool2d(1, stride=1)
+        # NOTE(review): fc expects 128 * block.expansion features, which the
+        # flatten in forward() only yields when layer4's output is 1x1 - verify.
+        self.fc = nn.Linear(128 * block.expansion, num_classes)
+
+        # Weight init: conv weights ~ N(0, sqrt(2 / (k_h * k_w * out_channels))),
+        # batch-norm scale set to 1 and shift set to 0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        # Build one stage of `blocks` residual blocks producing
+        # planes * block.expansion channels; only the first block gets `stride`.
+        downsample = None
+        # A 1x1 conv + batch norm on the shortcut is needed whenever the first
+        # block changes the resolution or the channel count.
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        # Later blocks (and the next stage) start from the expanded width
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        # Stem, max pool, layer1-layer4, pool, flatten, fc (layer5 is skipped)
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+def resnet18(pretrained=False, **kwargs):
+    """Build the ResNet-18 variant (``BasicBlock``, depths ``[2, 2, 2, 2, 2]``).
+
+    Args:
+        pretrained (bool): If True, download the ``resnet18`` entry of
+            ``model_urls`` and load it into the model.
+        **kwargs: Forwarded to the ``ResNet`` constructor.
+    """
+    net = ResNet(BasicBlock, [2, 2, 2, 2, 2], **kwargs)
+    if not pretrained:
+        return net
+    weights = model_zoo.load_url(model_urls['resnet18'])
+    net.load_state_dict(weights)
+    return net
+
+
+def resnet34(pretrained=False, **kwargs):
+    """Build the ResNet-34 variant (``BasicBlock``, depths ``[3, 4, 6, 3, 3]``).
+
+    Args:
+        pretrained (bool): If True, download the ``resnet34`` entry of
+            ``model_urls`` and load it into the model.
+        **kwargs: Forwarded to the ``ResNet`` constructor.
+    """
+    net = ResNet(BasicBlock, [3, 4, 6, 3, 3], **kwargs)
+    if not pretrained:
+        return net
+    weights = model_zoo.load_url(model_urls['resnet34'])
+    net.load_state_dict(weights)
+    return net
+
+
+def resnet50(pretrained=False, **kwargs):
+    """Build the ResNet-50 variant (``Bottleneck``, depths ``[3, 4, 6, 3]``).
+
+    Args:
+        pretrained (bool): If True, download the ``resnet50`` entry of
+            ``model_urls`` and load it into the model.
+        **kwargs: Forwarded to the ``ResNet`` constructor.
+    """
+    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+    if not pretrained:
+        return net
+    weights = model_zoo.load_url(model_urls['resnet50'])
+    net.load_state_dict(weights)
+    return net
+
+
+def resnet101(pretrained=False, **kwargs):
+    """Build the ResNet-101 variant (``Bottleneck``, depths ``[3, 4, 23, 3]``).
+
+    Args:
+        pretrained (bool): If True, download the ``resnet101`` entry of
+            ``model_urls`` and load it into the model.
+        **kwargs: Forwarded to the ``ResNet`` constructor.
+    """
+    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
+    if not pretrained:
+        return net
+    weights = model_zoo.load_url(model_urls['resnet101'])
+    net.load_state_dict(weights)
+    return net
+
+
+def resnet152(pretrained=False, **kwargs):
+    """Build the ResNet-152 variant (``Bottleneck``, depths ``[3, 8, 36, 3]``).
+
+    Args:
+        pretrained (bool): If True, download the ``resnet152`` entry of
+            ``model_urls`` and load it into the model.
+        **kwargs: Forwarded to the ``ResNet`` constructor.
+    """
+    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
+    if not pretrained:
+        return net
+    weights = model_zoo.load_url(model_urls['resnet152'])
+    net.load_state_dict(weights)
+    return net
\ No newline at end of file
--- a/Speaker_Recognition/train1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/train1.py 0 → 100644
View file @39d50f9
--- a/Speaker_Recognition/verification1.py 0 → 100644
View file @39d50f9
+++ b/Speaker_Recognition/verification1.py 0 → 100644
View file @39d50f9
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model1 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda (bool): Move the model to the GPU when True.
+        log_dir (str): Directory that holds ``checkpoint_<cp_num>.pth``.
+        cp_num (int): Number of the checkpoint file to load.
+        embedding_size (int): Forwarded to ``background_resnet``.
+        n_classes (int): Forwarded to ``background_resnet`` as ``num_classes``.
+
+    Returns:
+        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without CUDA, remap the stored tensors to the CPU so that a checkpoint
+    # written on a GPU machine can still be loaded.
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix (DataParallel) is
+    # stripped here, so the checkpoint keys must already match the model.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature files under ``dataroot_dir`` into enroll and test sets.
+
+    Args:
+        dataroot_dir (str): Directory passed to ``read_feats_structure``.
+
+    Returns:
+        tuple: ``(enroll_DB, test_DB)`` - the rows whose ``filename`` contains
+        ``enroll.p`` and ``test.p`` respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+
+    # regex=False: match the names literally. Interpreted as a regular
+    # expression the '.' matches any character, so a path containing e.g.
+    # 'test/p...' would wrongly be classified as a test file.
+    is_enroll = DB_all['filename'].str.contains('enroll.p', regex=False)
+    is_test = DB_all['filename'].str.contains('test.p', regex=False)
+
+    # Reset the index
+    enroll_DB = DB_all[is_enroll].reset_index(drop=True)
+    test_DB = DB_all[is_test].reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir, map_location=None):
+    """Load every ``<speaker>.pth`` embedding stored in ``embedding_dir``.
+
+    Entries that are not regular ``.pth`` files (sub-directories, notes,
+    editor checkpoints, ...) are skipped instead of being handed to
+    ``torch.load``.
+
+    Args:
+        embedding_dir (str): Directory containing the saved embeddings.
+        map_location: Forwarded to ``torch.load``; pass ``'cpu'`` to load
+            embeddings that were saved from a GPU on a CPU-only machine.
+
+    Returns:
+        dict: speaker name (file name without the ``.pth`` suffix) -> the
+        object stored in that file.
+    """
+    embeddings = {}
+    for f in sorted(os.listdir(embedding_dir)):
+        # splitext removes only the trailing extension, so a '.pth' that
+        # occurs inside the speaker name is preserved.
+        spk, ext = os.path.splitext(f)
+        embedding_path = os.path.join(embedding_dir, f)
+        if ext != '.pth' or not os.path.isfile(embedding_path):
+            continue
+        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalised embedding for the utterance in ``filename``.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter); each segment is passed through ``model``
+    and the resulting embeddings are summed before normalisation.
+
+    Args:
+        use_cuda (bool): Move each segment to the GPU before the forward pass.
+        filename (str): Feature file read with ``read_MFB``.
+        model: Network whose forward pass returns ``(embedding, output)``.
+        test_frames (int): Number of frames per segment.
+
+    Returns:
+        torch.Tensor: The summed embedding after ``l2_norm(..., 1)``.
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)
+
+    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
+    if tot_segments == 0:
+        # Otherwise `activation` would stay the int 0 and l2_norm() would
+        # fail with an unrelated AttributeError.
+        raise ValueError("No frames found in %s" % filename)
+
+    to_tensor = ToTensorTestInput() # built once, reused for every segment
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            start = i*test_frames
+            segment = to_tensor(feats[start:start+test_frames]) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation,_ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to unit L2 norm, then multiply by ``alpha``.
+
+    Args:
+        input (torch.Tensor): Tensor expected to be 2-D (rows, dim); the norm
+            is taken along dim 1.
+        alpha: Factor applied to the normalised rows.
+
+    Returns:
+        torch.Tensor: Tensor of the same size as ``input``.
+    """
+    # Squared row norms; the small constant keeps the division finite for
+    # all-zero rows.
+    squared_norm = torch.sum(torch.pow(input, 2), 1).add_(1e-10)
+    row_norm = torch.sqrt(squared_norm).view(-1, 1)
+    normalised = torch.div(input, row_norm.expand_as(input)).view(input.size())
+    # alpha is a scaling factor (cf. https://arxiv.org/pdf/1703.09507.pdf)
+    return normalised * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Compare a test utterance with one enrolled speaker and print the verdict.
+
+    The cosine similarity between the two embeddings is compared with
+    ``thres``: above the threshold the result is 'Accept', otherwise 'Reject'.
+
+    Args:
+        use_cuda (bool): Forwarded to ``get_embeddings``.
+        model: Forwarded to ``get_embeddings``.
+        embeddings (dict): speaker id -> enrolled embedding.
+        enroll_speaker: Key of the enrolled speaker to compare against.
+        test_filename (str): Feature file of the test utterance; the name of
+            its parent directory (up to the first ``_``) is printed as the
+            test speaker.
+        test_frames (int): Forwarded to ``get_embeddings``.
+        thres (float): Decision threshold on the cosine similarity.
+
+    Raises:
+        KeyError: If ``enroll_speaker`` has no enrolled embedding.
+    """
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    # Loaded embeddings keep the device they were saved from; move the
+    # enrolled one next to the test embedding so the comparison cannot fail
+    # on a CPU/GPU mismatch.
+    enroll_embedding = embeddings[enroll_speaker].to(test_embedding.device)
+
+    # .item() yields a plain Python float; formatting a 1-element array with
+    # '%f' relies on a deprecated implicit array-to-scalar conversion.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    result = 'Accept' if score > thres else 'Reject'
+
+    # Use os.path so the parent directory is found regardless of the path
+    # separator that os.path.join() produced.
+    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Verify one hard-coded test utterance against one enrolled speaker.
+
+    Loads a checkpoint and the enrolled embeddings, then prints the
+    accept/reject decision for the configured speaker pair.
+    """
+    
+    log_dir = 'new_model1' # Where the checkpoints are saved
+    embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
+    # NOTE(review): hard-coded, while split_enroll_and_test() below reads
+    # c.TEST_FEAT_DIR - confirm both point at the same data.
+    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
+    
+    # Settings
+    use_cuda = True # Use cuda or not
+    embedding_size = 128 # Dimension of speaker embeddings
+    # NOTE(review): enroll1.py in this commit enrolls with checkpoint 24 and
+    # test_frames = 200 - confirm the mismatch with the values here is intended.
+    cp_num = 29  # Which checkpoint to use?
+    n_classes = 241 # How many speakers in training data?
+    test_frames = 100 # Split the test utterance 
+
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+    
+    # Get the dataframe for test DB (neither result is used below)
+    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
+    
+    # Load enroll embeddings
+    embeddings = load_enroll_embeddings(embedding_dir)
+    
+    # The string below is a bare expression used as a note; it has no effect.
+    """ Test speaker list
+    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', 
+    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    """ 
+    
+    # Set the true speaker (must have a matching .pth file in embedding_dir)
+    enroll_speaker = 'zerothfloac'
+    
+    # Set the claimed speaker (sub-directory of test_dir holding test.p)
+    test_speaker = 'zerothfloac' 
+    
+    # Threshold
+    thres = 0.95
+    
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+    
+    # Perform the test 
+    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
+
+if __name__ == '__main__':
+    main()