김건

Resnet34 + Layer ver, Resnet50 ver commit

Speaker_Recognition @ df38711f
Subproject commit df38711f36cfb15ee578d14a70d0141d1d0a8134
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model3 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Identify the speaker of `test_filename` among the candidates in `spk_list`.

    Scores the test embedding against each enrolled embedding with cosine
    similarity and returns the id of the best-scoring speaker.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')  # safe lower bound for any similarity value
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        # .item() yields a plain float (the old .data.cpu().numpy() kept a
        # one-element array around just for this comparison).
        score = score.item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # The true speaker id is the directory name that holds the test file,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
def main():
    """Run one speaker-identification trial with the model3 configuration."""
    log_dir = 'new_model3' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 11 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Candidate speakers scored during identification.
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
    # Set the test speaker
    test_speaker = '233F4013'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Identify the speaker of `test_filename` among the candidates in `spk_list`.

    Scores the test embedding against each enrolled embedding with cosine
    similarity and returns the id of the best-scoring speaker.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')  # safe lower bound for any similarity value
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        # .item() yields a plain float (the old .data.cpu().numpy() kept a
        # one-element array around just for this comparison).
        score = score.item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # The true speaker id is the directory name that holds the test file,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
def main():
    """Run one speaker-identification trial with the model4 configuration."""
    log_dir = 'new_model4' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 25 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Candidate speakers scored during identification.
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
    # Set the test speaker
    test_speaker = '207F2088'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model5 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Identify the speaker of `test_filename` among the candidates in `spk_list`.

    Scores the test embedding against each enrolled embedding with cosine
    similarity and returns the id of the best-scoring speaker.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')  # safe lower bound for any similarity value
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        # .item() yields a plain float (the old .data.cpu().numpy() kept a
        # one-element array around just for this comparison).
        score = score.item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # The true speaker id is the directory name that holds the test file,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
def main():
    """Run one speaker-identification trial with the model5 configuration."""
    log_dir = 'new_model5' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings5' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 30 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Candidate speakers scored during identification.
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
    # Set the test speaker
    test_speaker = '207F2088'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet1 as resnet
class background_resnet(nn.Module):
    """ResNet backbone plus an embedding head for speaker recognition.

    forward() returns (spk_embedding, logits): the raw speaker embedding
    from fc0 and the per-speaker classification logits from the last layer.
    """
    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # NOTE(review): fc0 assumes the pooled backbone output has 256
        # channels; this depends on the custom model.resnet1 -- confirm.
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)
        # NOTE(review): layer5 is not part of torchvision's ResNet; the
        # custom model.resnet1 must define it ("Layer ver") -- confirm.
        x = self.pretrained.layer5(x)
        out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1]
        out = torch.squeeze(out) # [batch, n_embed]
        # flatten the out so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1) # (n_batch, n_embed)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
        out = self.last(out)
        return spk_embedding, out
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet1 as resnet
class background_resnet(nn.Module):
    """ResNet-34 ("Layer ver") backbone plus an embedding head for speaker
    recognition.

    forward() returns (spk_embedding, logits): the raw speaker embedding
    from fc0 and the per-speaker classification logits from the last layer.
    """
    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # Changing 256 to e.g. 512 would allow using a standard resnet
        # (translated from the original Korean comment).
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)
        # NOTE(review): layer5 is not part of torchvision's ResNet; the
        # custom model.resnet1 must define it ("Layer ver") -- confirm.
        x = self.pretrained.layer5(x)
        out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1]
        out = torch.squeeze(out) # [batch, n_embed]
        # flatten the out so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1) # (n_batch, n_embed)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
        out = self.last(out)
        return spk_embedding, out
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet1 as resnet
class background_resnet(nn.Module):
    """ResNet-50 backbone plus an embedding head for speaker recognition.

    forward() returns (spk_embedding, logits): the raw speaker embedding
    from fc0 and the per-speaker classification logits from the last layer.
    """
    def __init__(self, embedding_size, num_classes, backbone='resnet50'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # Set to 512 so that a resnet (e.g. the 50-layer variant from the
        # custom model.resnet1) can be used (translated from the original
        # Korean comment). NOTE(review): depends on the backbone's output
        # channel count -- confirm against model.resnet1.
        self.fc0 = nn.Linear(512, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)
        out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1]
        out = torch.squeeze(out) # [batch, n_embed]
        # flatten the out so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1) # (n_batch, n_embed)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
        out = self.last(out)
        return spk_embedding, out
\ No newline at end of file
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model3 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Verify whether `test_filename` matches the enrolled `enroll_speaker`.

    Accepts the claim when the cosine similarity between the test and the
    enrollment embedding exceeds `thres`; prints the decision and score.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    score = F.cosine_similarity(test_embedding, enroll_embedding)
    # .item() yields a plain float (the old .data.cpu().numpy() kept a
    # one-element array around just for the comparison and formatting).
    score = score.item()
    result = 'Accept' if score > thres else 'Reject'
    # The claimed speaker id is encoded in the test file's directory name,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Run one speaker-verification trial with the model3 configuration."""
    log_dir = 'new_model3' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 11 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Set the true speaker
    enroll_speaker = '103F3021'
    # Set the claimed speaker
    test_speaker = '207F2088'
    # Threshold
    thres = 0.95
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Verify whether `test_filename` matches the enrolled `enroll_speaker`.

    Accepts the claim when the cosine similarity between the test and the
    enrollment embedding exceeds `thres`; prints the decision and score.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    score = F.cosine_similarity(test_embedding, enroll_embedding)
    # .item() yields a plain float (the old .data.cpu().numpy() kept a
    # one-element array around just for the comparison and formatting).
    score = score.item()
    result = 'Accept' if score > thres else 'Reject'
    # The claimed speaker id is encoded in the test file's directory name,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Run one speaker-verification trial with the model4 configuration."""
    log_dir = 'new_model4' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 25 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Set the true speaker
    enroll_speaker = '229M2031'
    # Set the claimed speaker
    test_speaker = 'sunghwan1'
    # Threshold
    thres = 0.95
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a background_resnet and load weights from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: checkpoint number to load.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speaker classes the model was trained with.

    Returns:
        The model in eval() mode with the checkpoint weights loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location keeps CPU-only runs working even when the checkpoint
    # was saved from a CUDA device.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # Inference only: freeze dropout / use running batch-norm statistics.
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB) pandas DataFrames with fresh 0-based indexes.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substrings 'enroll.p' / 'test.p'.
    # Without it, '.' is a regex wildcard and would match any character.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so downstream positional access starts at 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every enrollment embedding stored in embedding_dir.

    Args:
        embedding_dir: directory of '<speaker_id>.pth' embedding tensors.

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        # Only pick up saved embedding files; torch.load would crash on
        # unrelated files that may live in the same directory.
        if not f.endswith('.pth'):
            continue
        # Strip only the extension (str.replace would also remove an
        # accidental '.pth' occurring inside the name).
        spk = f[:-len('.pth')]
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised utterance embedding for a feature file.

    The utterance is chopped into fixed-length chunks of `test_frames`
    frames; each chunk is embedded by the model, the chunk embeddings are
    summed, and the sum is L2-normalised.
    """
    feats, _ = read_MFB(filename)  # feats: (n_frames, n_dims)
    n_segments = math.ceil(len(feats) / test_frames)
    to_tensor = ToTensorTestInput()
    summed = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(feats[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            embedding, _ = model(segment)
            summed = summed + torch.sum(embedding, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """L2-normalise each row of `input`, then scale the result by `alpha`."""
    # Squared L2 norm per row, with a tiny epsilon for numerical stability.
    squared_sum = torch.pow(input, 2).sum(dim=1) + 1e-10
    row_norm = torch.sqrt(squared_sum)  # one norm per row
    # Divide every row by its own norm.
    normalized = input / row_norm.view(-1, 1).expand_as(input)
    # Scaling by alpha = 10 is suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return normalized * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Verify whether `test_filename` matches the enrolled `enroll_speaker`.

    Accepts the claim when the cosine similarity between the test and the
    enrollment embedding exceeds `thres`; prints the decision and score.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    score = F.cosine_similarity(test_embedding, enroll_embedding)
    # .item() yields a plain float (the old .data.cpu().numpy() kept a
    # one-element array around just for the comparison and formatting).
    score = score.item()
    result = 'Accept' if score > thres else 'Reject'
    # The claimed speaker id is encoded in the test file's directory name,
    # e.g. .../<spk>_xxx/test.p -> <spk>.
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Run one speaker-verification trial with the merged model4 configuration."""
    log_dir = 'new_model4_merge' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4_merge' # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
    # Settings
    use_cuda = True # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    cp_num = 50 # Which checkpoint to use?
    n_classes = 348 # How many speakers in training data?
    test_frames = 100 # Split the test utterance
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    # Get the dataframe for test DB
    # NOTE(review): the split reads c.TEST_FEAT_DIR while test_path below
    # uses the hard-coded test_dir -- confirm they point at the same tree.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """
    # Set the true speaker
    enroll_speaker = '213F5100'
    # Set the claimed speaker
    test_speaker = '207F2088'
    # Threshold
    thres = 0.95
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the background ResNet and restore its weights from a checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: number of the checkpoint to restore.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speakers in the training data (classifier size).

    Returns:
        The restored model, switched to eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location='cpu' lets a GPU-saved checkpoint load on a CPU-only host;
    # load_state_dict copies the tensors onto the model's device afterwards.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB): DataFrames whose filenames contain the literal
        substrings 'enroll.p' and 'test.p' respectively, re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substring. With the default regex=True
    # the '.' is a wildcard and would also match e.g. 'enrollXp'.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so each partition is contiguously numbered from 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding found in `embedding_dir`.

    Each file '<speaker_id>.pth' is loaded with torch.load and keyed by
    its speaker id (filename without the '.pth' suffix).

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    speaker_to_embedding = {}
    for fname in os.listdir(embedding_dir):
        speaker_id = fname.replace('.pth', '')
        full_path = os.path.join(embedding_dir, fname)
        speaker_to_embedding[speaker_id] = torch.load(full_path)
    return speaker_to_embedding
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute an utterance-level speaker embedding for one feature file.

    The utterance is chopped into segments of `test_frames` frames, each
    segment is run through the model, the per-segment activations are summed,
    and the sum is L2-normalized into a single embedding.

    Args:
        use_cuda: run the model on GPU when True.
        filename: path to the feature file read by read_MFB.
        model: trained embedding network.
        test_frames: number of frames per segment.

    Returns:
        L2-normalized embedding tensor of shape (1, dim).
    """
    features, _ = read_MFB(filename)  # (n_frames, n_dims)
    n_segments = math.ceil(len(features) / test_frames)
    to_tensor = ToTensorTestInput()
    accumulated = 0
    with torch.no_grad():
        for seg_idx in range(n_segments):
            start = seg_idx * test_frames
            segment = to_tensor(features[start:start + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            seg_activation, _ = model(segment)
            accumulated = accumulated + torch.sum(seg_activation, dim=0, keepdim=True)
        accumulated = l2_norm(accumulated, 1)
    return accumulated
def l2_norm(input, alpha):
    """L2-normalize each row of `input`, then scale by `alpha`.

    A small epsilon (1e-10) guards against division by zero on all-zero rows.
    Scaling by alpha follows https://arxiv.org/pdf/1703.09507.pdf (alpha=10
    there; callers in this file pass alpha=1).

    Args:
        input: tensor of shape (n_rows, dim).
        alpha: scalar multiplier applied after normalization.

    Returns:
        Tensor of the same shape with unit-norm rows scaled by alpha.
    """
    # Per-row Euclidean norm, kept as a column so it broadcasts over dim.
    row_norm = input.pow(2).sum(dim=1, keepdim=True).add(1e-10).sqrt()
    normalized = input / row_norm
    return normalized * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Verify whether `test_filename` belongs to the enrolled speaker.

    Computes the cosine similarity between the test utterance embedding and
    the enrolled speaker's embedding, and accepts when it exceeds `thres`.

    Args:
        use_cuda: run the model on GPU when True.
        model: trained embedding network in eval mode.
        embeddings: dict mapping speaker id -> enrolled embedding tensor.
        enroll_speaker: id of the enrolled (claimed-against) speaker.
        test_filename: path to the test feature file, '.../<spk>/test.p'.
        test_frames: segment length used when embedding the utterance.
        thres: accept/reject threshold on the cosine score.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    # .item() yields a plain Python float; comparing and %-formatting a
    # 1-element numpy array (the original approach) is deprecated in numpy.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
    if score > thres:
        result = 'Accept'
    else:
        result = 'Reject'
    # Speaker id is the parent directory name of the test file.
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Run one hard-coded speaker-verification trial (zeroth model)."""
    # Directories
    log_dir = 'new_model4_zeroth'                # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4_zeroth'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'     # Where test features are saved

    # Settings
    use_cuda = True       # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30           # Which checkpoint to use?
    n_classes = 105       # How many speakers in training data?
    test_frames = 100     # Split the test utterance

    # Restore the trained model and gather enrollment data.
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    embeddings = load_enroll_embeddings(embedding_dir)

    # Available test speakers:
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '777M7777'  # the true (enrolled) speaker
    test_speaker = '103F3021'    # the claimed speaker
    thres = 0.95                 # accept/reject threshold

    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    perform_verification(use_cuda, model, embeddings, enroll_speaker,
                         test_path, test_frames, thres)
# Script entry point: run a single hard-coded verification trial.
if __name__ == '__main__':
    main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model5 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the background ResNet and restore its weights from a checkpoint.

    Args:
        use_cuda: move the model to the GPU when True.
        log_dir: directory containing 'checkpoint_<cp_num>.pth' files.
        cp_num: number of the checkpoint to restore.
        embedding_size: dimensionality of the speaker embedding.
        n_classes: number of speakers in the training data (classifier size).

    Returns:
        The restored model, switched to eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # map_location='cpu' lets a GPU-saved checkpoint load on a CPU-only host;
    # load_state_dict copies the tensors onto the model's device afterwards.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature DB into enrollment and test partitions.

    Args:
        dataroot_dir: root directory scanned by read_feats_structure.

    Returns:
        (enroll_DB, test_DB): DataFrames whose filenames contain the literal
        substrings 'enroll.p' and 'test.p' respectively, re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    # regex=False: match the literal substring. With the default regex=True
    # the '.' is a wildcard and would also match e.g. 'enrollXp'.
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
    # Reset the index so each partition is contiguously numbered from 0.
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
    """Load all enrolled speaker embeddings stored under `embedding_dir`.

    Every '<speaker_id>.pth' file in the directory is loaded with torch.load
    and keyed by its speaker id (filename without the '.pth' suffix).

    Returns:
        dict mapping speaker id -> embedding tensor.
    """
    return {
        f.replace('.pth', ''): torch.load(os.path.join(embedding_dir, f))
        for f in os.listdir(embedding_dir)
    }
def get_embeddings(use_cuda, filename, model, test_frames):
    """Embed one utterance by summing model activations over fixed segments.

    The feature matrix is split into chunks of `test_frames` frames; each
    chunk is embedded by the model, the chunk activations are summed, and
    the sum is L2-normalized to produce the final utterance embedding.

    Args:
        use_cuda: run the model on GPU when True.
        filename: path to the feature file read by read_MFB.
        model: trained embedding network.
        test_frames: number of frames per chunk.

    Returns:
        L2-normalized embedding tensor of shape (1, dim).
    """
    feats, _ = read_MFB(filename)  # (n_frames, n_dims)
    num_chunks = math.ceil(len(feats) / test_frames)
    summed = 0
    with torch.no_grad():
        for chunk_idx in range(num_chunks):
            lo = chunk_idx * test_frames
            chunk = ToTensorTestInput()(feats[lo:lo + test_frames])  # (1, 1, n_dims, n_frames)
            if use_cuda:
                chunk = chunk.cuda()
            chunk_out, _ = model(chunk)
            summed = summed + torch.sum(chunk_out, dim=0, keepdim=True)
        summed = l2_norm(summed, 1)
    return summed
def l2_norm(input, alpha):
    """Normalize each row of `input` to unit L2 norm and scale by `alpha`.

    A tiny epsilon (1e-10) is added before the square root so all-zero rows
    do not divide by zero. The alpha scaling follows
    https://arxiv.org/pdf/1703.09507.pdf (alpha=10 there; callers here use 1).

    Args:
        input: tensor of shape (n_rows, dim).
        alpha: scalar multiplier applied after normalization.

    Returns:
        Tensor of the same shape with unit-norm rows scaled by alpha.
    """
    squared = input * input
    row_norms = torch.sqrt(squared.sum(1) + 1e-10)  # (n_rows,)
    # unsqueeze to a column so the division broadcasts across dim.
    return alpha * (input / row_norms.unsqueeze(1))
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Verify whether `test_filename` belongs to the enrolled speaker.

    Computes the cosine similarity between the test utterance embedding and
    the enrolled speaker's embedding, and accepts when it exceeds `thres`.

    Args:
        use_cuda: run the model on GPU when True.
        model: trained embedding network in eval mode.
        embeddings: dict mapping speaker id -> enrolled embedding tensor.
        enroll_speaker: id of the enrolled (claimed-against) speaker.
        test_filename: path to the test feature file, '.../<spk>/test.p'.
        test_frames: segment length used when embedding the utterance.
        thres: accept/reject threshold on the cosine score.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    # .item() yields a plain Python float; comparing and %-formatting a
    # 1-element numpy array (the original approach) is deprecated in numpy.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
    if score > thres:
        result = 'Accept'
    else:
        result = 'Reject'
    # Speaker id is the parent directory name of the test file.
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Run one hard-coded speaker-verification trial (model5)."""
    # Directories
    log_dir = 'new_model5'                    # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings5'      # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True       # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30           # Which checkpoint to use?
    n_classes = 241       # How many speakers in training data?
    test_frames = 100     # Split the test utterance

    # Restore the trained model and gather enrollment data.
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
    embeddings = load_enroll_embeddings(embedding_dir)

    # Available test speakers:
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '777M7777'  # the true (enrolled) speaker
    test_speaker = 'sunghwan1'   # the claimed speaker
    thres = 0.95                 # accept/reject threshold

    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    perform_verification(use_cuda, model, embeddings, enroll_speaker,
                         test_path, test_frames, thres)
# Script entry point: run a single hard-coded verification trial.
if __name__ == '__main__':
    main()