김건

Resnet34 + Layer ver, Resnet50 ver commit

Speaker_Recognition @ df38711f
1 +Subproject commit df38711f36cfb15ee578d14a70d0141d1d0a8134
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model3 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, the model is moved to the GPU before loading.
        log_dir: Directory that holds the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Passed to ``background_resnet`` as ``embedding_size``.
        n_classes: Passed to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be read when CUDA is unavailable; load_state_dict() copies the
    # values into the model's parameters on whatever device they live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    # A model saved while wrapped in DataParallel has every key prefixed
    # with `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing under ``dataroot_dir`` into enroll and test sets.

    Args:
        dataroot_dir: Directory passed to ``read_feats_structure``.

    Returns:
        An ``(enroll_DB, test_DB)`` tuple of DataFrames, each re-indexed from
        0, holding the rows whose ``filename`` contains ``enroll.p`` and
        ``test.p`` respectively.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False matches the literal text; interpreted as a regular
    # expression the `.` would match any character (e.g. `enroll_p`).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index so both frames start at 0.
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every saved speaker embedding found in ``embedding_dir``.

    Args:
        embedding_dir: Directory containing ``<speaker>.pth`` files.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``). ``None`` keeps ``torch.load``'s default.

    Returns:
        Dict mapping the speaker name (file name without its ``.pth``
        extension) to the object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension; str.replace would
        # also delete a `.pth` occurring in the middle of the name.
        spk, ext = os.path.splitext(entry)
        embedding_path = os.path.join(embedding_dir, entry)
        # Skip anything that is not a saved embedding (stray files,
        # sub-directories) instead of handing it to torch.load.
        if ext != '.pth' or not os.path.isfile(embedding_path):
            continue
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute a length-normalized embedding for one feature file.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the final chunk may be shorter), each chunk is passed through ``model``,
    and the resulting embeddings are summed and L2-normalized.

    Args:
        use_cuda: If true, each chunk is moved to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns ``(embedding, _)``.
        test_frames: Number of frames per chunk.

    Returns:
        The summed, L2-normalized embedding tensor.

    Raises:
        ValueError: If the file yields no chunk to embed.
    """
    # Original note: features are (n_frames, n_dims) — confirm against read_MFB.
    features, _ = read_MFB(filename)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments <= 0:
        # Without any chunk the accumulator would stay the integer 0 and
        # l2_norm() would fail with an obscure AttributeError.
        raise ValueError('no frames to embed in %s' % filename)

    # Built once rather than per chunk; assumes the transform keeps no
    # per-call state — TODO confirm against SR_Dataset.
    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            # Original note: transform output is (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale ``input`` so its vectors along dimension 1 have L2 norm ``alpha``.

    Args:
        input: Tensor with at least two dimensions, e.g. ``(batch, dim)``.
        alpha: Factor applied after normalization; ``1`` gives unit length.

    Returns:
        Tensor shaped like ``input`` whose dim-1 vectors have norm ``alpha``
        (up to the epsilon used below).
    """
    # keepdim=True lets the norm broadcast against inputs of any rank >= 2,
    # not only 2-D ones.
    squared_sum = torch.sum(torch.pow(input, 2), dim=1, keepdim=True)
    # The small epsilon avoids a division by zero for all-zero vectors.
    norm = torch.sqrt(squared_sum + 1e-10)
    # The alpha scaling is the one suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return torch.div(input, norm) * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to a test utterance.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Network used to embed the test utterance.
        embeddings: Dict mapping speaker name to enrolled embedding tensor.
        test_filename: Path of the test feature file; its parent directory
            name (up to the first ``_``) is reported as the true speaker.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Candidate speaker names; each must be a key of ``embeddings``.

    Returns:
        The name in ``spk_list`` with the highest cosine similarity, or
        ``None`` when ``spk_list`` is empty.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Compare on the test embedding's device so enrollments loaded on a
        # different device than the model still work.
        enroll_embedding = embeddings[spk].to(test_embedding.device)
        # .item() yields a plain float, so the comparison below does not
        # depend on the truth value of a NumPy array.
        score = F.cosine_similarity(test_embedding, enroll_embedding).item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # Parent directory name; unlike splitting on '/', os.path also handles
    # the platform's own separator.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Run one speaker-identification trial with the settings below."""
    log_dir = 'new_model3'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True fails in .cuda()
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 11  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): the two frames are not used further down.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777',
    ]

    # Set the test speaker
    test_speaker = '233F4013'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test (the result is printed by perform_identification)
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, the model is moved to the GPU before loading.
        log_dir: Directory that holds the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Passed to ``background_resnet`` as ``embedding_size``.
        n_classes: Passed to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be read when CUDA is unavailable; load_state_dict() copies the
    # values into the model's parameters on whatever device they live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    # A model saved while wrapped in DataParallel has every key prefixed
    # with `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing under ``dataroot_dir`` into enroll and test sets.

    Args:
        dataroot_dir: Directory passed to ``read_feats_structure``.

    Returns:
        An ``(enroll_DB, test_DB)`` tuple of DataFrames, each re-indexed from
        0, holding the rows whose ``filename`` contains ``enroll.p`` and
        ``test.p`` respectively.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False matches the literal text; interpreted as a regular
    # expression the `.` would match any character (e.g. `enroll_p`).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index so both frames start at 0.
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every saved speaker embedding found in ``embedding_dir``.

    Args:
        embedding_dir: Directory containing ``<speaker>.pth`` files.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``). ``None`` keeps ``torch.load``'s default.

    Returns:
        Dict mapping the speaker name (file name without its ``.pth``
        extension) to the object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension; str.replace would
        # also delete a `.pth` occurring in the middle of the name.
        spk, ext = os.path.splitext(entry)
        embedding_path = os.path.join(embedding_dir, entry)
        # Skip anything that is not a saved embedding (stray files,
        # sub-directories) instead of handing it to torch.load.
        if ext != '.pth' or not os.path.isfile(embedding_path):
            continue
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute a length-normalized embedding for one feature file.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the final chunk may be shorter), each chunk is passed through ``model``,
    and the resulting embeddings are summed and L2-normalized.

    Args:
        use_cuda: If true, each chunk is moved to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns ``(embedding, _)``.
        test_frames: Number of frames per chunk.

    Returns:
        The summed, L2-normalized embedding tensor.

    Raises:
        ValueError: If the file yields no chunk to embed.
    """
    # Original note: features are (n_frames, n_dims) — confirm against read_MFB.
    features, _ = read_MFB(filename)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments <= 0:
        # Without any chunk the accumulator would stay the integer 0 and
        # l2_norm() would fail with an obscure AttributeError.
        raise ValueError('no frames to embed in %s' % filename)

    # Built once rather than per chunk; assumes the transform keeps no
    # per-call state — TODO confirm against SR_Dataset.
    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            # Original note: transform output is (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale ``input`` so its vectors along dimension 1 have L2 norm ``alpha``.

    Args:
        input: Tensor with at least two dimensions, e.g. ``(batch, dim)``.
        alpha: Factor applied after normalization; ``1`` gives unit length.

    Returns:
        Tensor shaped like ``input`` whose dim-1 vectors have norm ``alpha``
        (up to the epsilon used below).
    """
    # keepdim=True lets the norm broadcast against inputs of any rank >= 2,
    # not only 2-D ones.
    squared_sum = torch.sum(torch.pow(input, 2), dim=1, keepdim=True)
    # The small epsilon avoids a division by zero for all-zero vectors.
    norm = torch.sqrt(squared_sum + 1e-10)
    # The alpha scaling is the one suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return torch.div(input, norm) * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to a test utterance.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Network used to embed the test utterance.
        embeddings: Dict mapping speaker name to enrolled embedding tensor.
        test_filename: Path of the test feature file; its parent directory
            name (up to the first ``_``) is reported as the true speaker.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Candidate speaker names; each must be a key of ``embeddings``.

    Returns:
        The name in ``spk_list`` with the highest cosine similarity, or
        ``None`` when ``spk_list`` is empty.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Compare on the test embedding's device so enrollments loaded on a
        # different device than the model still work.
        enroll_embedding = embeddings[spk].to(test_embedding.device)
        # .item() yields a plain float, so the comparison below does not
        # depend on the truth value of a NumPy array.
        score = F.cosine_similarity(test_embedding, enroll_embedding).item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # Parent directory name; unlike splitting on '/', os.path also handles
    # the platform's own separator.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Run one speaker-identification trial with the settings below."""
    log_dir = 'new_model4'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True fails in .cuda()
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 25  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): the two frames are not used further down.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777',
    ]

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test (the result is printed by perform_identification)
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model5 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, the model is moved to the GPU before loading.
        log_dir: Directory that holds the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Passed to ``background_resnet`` as ``embedding_size``.
        n_classes: Passed to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be read when CUDA is unavailable; load_state_dict() copies the
    # values into the model's parameters on whatever device they live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    # A model saved while wrapped in DataParallel has every key prefixed
    # with `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing under ``dataroot_dir`` into enroll and test sets.

    Args:
        dataroot_dir: Directory passed to ``read_feats_structure``.

    Returns:
        An ``(enroll_DB, test_DB)`` tuple of DataFrames, each re-indexed from
        0, holding the rows whose ``filename`` contains ``enroll.p`` and
        ``test.p`` respectively.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False matches the literal text; interpreted as a regular
    # expression the `.` would match any character (e.g. `enroll_p`).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index so both frames start at 0.
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every saved speaker embedding found in ``embedding_dir``.

    Args:
        embedding_dir: Directory containing ``<speaker>.pth`` files.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``). ``None`` keeps ``torch.load``'s default.

    Returns:
        Dict mapping the speaker name (file name without its ``.pth``
        extension) to the object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension; str.replace would
        # also delete a `.pth` occurring in the middle of the name.
        spk, ext = os.path.splitext(entry)
        embedding_path = os.path.join(embedding_dir, entry)
        # Skip anything that is not a saved embedding (stray files,
        # sub-directories) instead of handing it to torch.load.
        if ext != '.pth' or not os.path.isfile(embedding_path):
            continue
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute a length-normalized embedding for one feature file.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the final chunk may be shorter), each chunk is passed through ``model``,
    and the resulting embeddings are summed and L2-normalized.

    Args:
        use_cuda: If true, each chunk is moved to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns ``(embedding, _)``.
        test_frames: Number of frames per chunk.

    Returns:
        The summed, L2-normalized embedding tensor.

    Raises:
        ValueError: If the file yields no chunk to embed.
    """
    # Original note: features are (n_frames, n_dims) — confirm against read_MFB.
    features, _ = read_MFB(filename)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments <= 0:
        # Without any chunk the accumulator would stay the integer 0 and
        # l2_norm() would fail with an obscure AttributeError.
        raise ValueError('no frames to embed in %s' % filename)

    # Built once rather than per chunk; assumes the transform keeps no
    # per-call state — TODO confirm against SR_Dataset.
    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            # Original note: transform output is (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale ``input`` so its vectors along dimension 1 have L2 norm ``alpha``.

    Args:
        input: Tensor with at least two dimensions, e.g. ``(batch, dim)``.
        alpha: Factor applied after normalization; ``1`` gives unit length.

    Returns:
        Tensor shaped like ``input`` whose dim-1 vectors have norm ``alpha``
        (up to the epsilon used below).
    """
    # keepdim=True lets the norm broadcast against inputs of any rank >= 2,
    # not only 2-D ones.
    squared_sum = torch.sum(torch.pow(input, 2), dim=1, keepdim=True)
    # The small epsilon avoids a division by zero for all-zero vectors.
    norm = torch.sqrt(squared_sum + 1e-10)
    # The alpha scaling is the one suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return torch.div(input, norm) * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to a test utterance.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Network used to embed the test utterance.
        embeddings: Dict mapping speaker name to enrolled embedding tensor.
        test_filename: Path of the test feature file; its parent directory
            name (up to the first ``_``) is reported as the true speaker.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Candidate speaker names; each must be a key of ``embeddings``.

    Returns:
        The name in ``spk_list`` with the highest cosine similarity, or
        ``None`` when ``spk_list`` is empty.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Compare on the test embedding's device so enrollments loaded on a
        # different device than the model still work.
        enroll_embedding = embeddings[spk].to(test_embedding.device)
        # .item() yields a plain float, so the comparison below does not
        # depend on the truth value of a NumPy array.
        score = F.cosine_similarity(test_embedding, enroll_embedding).item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # Parent directory name; unlike splitting on '/', os.path also handles
    # the platform's own separator.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Run one speaker-identification trial with the settings below."""
    log_dir = 'new_model5'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings5'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True fails in .cuda()
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): the two frames are not used further down.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777',
    ]

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test (the result is printed by perform_identification)
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """Speaker network: a ResNet trunk followed by an embedding and a classifier head.

    The trunk comes from the project's ``resnet`` module. Its pooled output
    is projected by ``fc0`` to the speaker embedding, which is then passed
    through batch norm, ReLU and ``last`` to give ``num_classes`` scores.
    """

    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        supported = ('resnet50', 'resnet101', 'resnet152', 'resnet18', 'resnet34')
        if backbone not in supported:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # The constructor is looked up by name; pretrained weights are never
        # requested.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # fc0 expects 256 pooled features from the trunk.
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for the batch ``x``."""
        # Input per the original note: minibatch x 1 x 40 x 40.
        trunk = self.pretrained
        stages = (
            trunk.conv1, trunk.bn1, trunk.relu,
            trunk.layer1, trunk.layer2, trunk.layer3, trunk.layer4, trunk.layer5,
        )
        for stage in stages:
            x = stage(x)

        pooled = F.adaptive_avg_pool2d(x, 1)  # [batch, channels, 1, 1]
        # Flatten to (n_batch, channels) so the linear layer can consume it.
        pooled = torch.squeeze(pooled).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """Speaker network: a ResNet trunk followed by an embedding and a classifier head.

    The trunk comes from the project's ``resnet`` module. Its pooled output
    is projected by ``fc0`` to the speaker embedding, which is then passed
    through batch norm, ReLU and ``last`` to give ``num_classes`` scores.
    """

    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        supported = ('resnet50', 'resnet101', 'resnet152', 'resnet18', 'resnet34')
        if backbone not in supported:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # The constructor is looked up by name; pretrained weights are never
        # requested.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Original note: changing the input size (e.g. to 512) makes a plain
        # resnet usable.
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for the batch ``x``."""
        # Input per the original note: minibatch x 1 x 40 x 40.
        trunk = self.pretrained
        stages = (
            trunk.conv1, trunk.bn1, trunk.relu,
            trunk.layer1, trunk.layer2, trunk.layer3, trunk.layer4, trunk.layer5,
        )
        for stage in stages:
            x = stage(x)

        pooled = F.adaptive_avg_pool2d(x, 1)  # [batch, channels, 1, 1]
        # Flatten to (n_batch, channels) so the linear layer can consume it.
        pooled = torch.squeeze(pooled).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """Speaker network: a ResNet trunk followed by an embedding and a classifier head.

    The trunk comes from the project's ``resnet`` module. Its pooled output
    is projected by ``fc0`` to the speaker embedding, which is then passed
    through batch norm, ReLU and ``last`` to give ``num_classes`` scores.
    """

    def __init__(self, embedding_size, num_classes, backbone='resnet50'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        supported = ('resnet50', 'resnet101', 'resnet152', 'resnet18', 'resnet34')
        if backbone not in supported:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # The constructor is looked up by name; pretrained weights are never
        # requested.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Original note: changing the input size (e.g. to 512) makes a plain
        # resnet usable.
        self.fc0 = nn.Linear(512, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for the batch ``x``."""
        # Input per the original note: minibatch x 1 x 40 x 40.
        trunk = self.pretrained
        stages = (
            trunk.conv1, trunk.bn1, trunk.relu,
            trunk.layer1, trunk.layer2, trunk.layer3, trunk.layer4,
        )
        for stage in stages:
            x = stage(x)

        pooled = F.adaptive_avg_pool2d(x, 1)  # [batch, channels, 1, 1]
        # Flatten to (n_batch, channels) so the linear layer can consume it.
        pooled = torch.squeeze(pooled).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model3 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, the model is moved to the GPU before loading.
        log_dir: Directory that holds the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Passed to ``background_resnet`` as ``embedding_size``.
        n_classes: Passed to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be read when CUDA is unavailable; load_state_dict() copies the
    # values into the model's parameters on whatever device they live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    # A model saved while wrapped in DataParallel has every key prefixed
    # with `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing under ``dataroot_dir`` into enroll and test sets.

    Args:
        dataroot_dir: Directory passed to ``read_feats_structure``.

    Returns:
        An ``(enroll_DB, test_DB)`` tuple of DataFrames, each re-indexed from
        0, holding the rows whose ``filename`` contains ``enroll.p`` and
        ``test.p`` respectively.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False matches the literal text; interpreted as a regular
    # expression the `.` would match any character (e.g. `enroll_p`).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index so both frames start at 0.
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every saved speaker embedding found in ``embedding_dir``.

    Args:
        embedding_dir: Directory containing ``<speaker>.pth`` files.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``). ``None`` keeps ``torch.load``'s default.

    Returns:
        Dict mapping the speaker name (file name without its ``.pth``
        extension) to the object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension; str.replace would
        # also delete a `.pth` occurring in the middle of the name.
        spk, ext = os.path.splitext(entry)
        embedding_path = os.path.join(embedding_dir, entry)
        # Skip anything that is not a saved embedding (stray files,
        # sub-directories) instead of handing it to torch.load.
        if ext != '.pth' or not os.path.isfile(embedding_path):
            continue
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute a length-normalized embedding for one feature file.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the final chunk may be shorter), each chunk is passed through ``model``,
    and the resulting embeddings are summed and L2-normalized.

    Args:
        use_cuda: If true, each chunk is moved to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns ``(embedding, _)``.
        test_frames: Number of frames per chunk.

    Returns:
        The summed, L2-normalized embedding tensor.

    Raises:
        ValueError: If the file yields no chunk to embed.
    """
    # Original note: features are (n_frames, n_dims) — confirm against read_MFB.
    features, _ = read_MFB(filename)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments <= 0:
        # Without any chunk the accumulator would stay the integer 0 and
        # l2_norm() would fail with an obscure AttributeError.
        raise ValueError('no frames to embed in %s' % filename)

    # Built once rather than per chunk; assumes the transform keeps no
    # per-call state — TODO confirm against SR_Dataset.
    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            # Original note: transform output is (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale ``input`` so its vectors along dimension 1 have L2 norm ``alpha``.

    Args:
        input: Tensor with at least two dimensions, e.g. ``(batch, dim)``.
        alpha: Factor applied after normalization; ``1`` gives unit length.

    Returns:
        Tensor shaped like ``input`` whose dim-1 vectors have norm ``alpha``
        (up to the epsilon used below).
    """
    # keepdim=True lets the norm broadcast against inputs of any rank >= 2,
    # not only 2-D ones.
    squared_sum = torch.sum(torch.pow(input, 2), dim=1, keepdim=True)
    # The small epsilon avoids a division by zero for all-zero vectors.
    norm = torch.sqrt(squared_sum + 1e-10)
    # The alpha scaling is the one suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return torch.div(input, norm) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Accept or reject a test utterance against one enrolled speaker.

    The decision is ``Accept`` when the cosine similarity between the test
    embedding and the enrolled embedding exceeds ``thres``; the outcome,
    score and threshold are printed.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Network used to embed the test utterance.
        embeddings: Dict mapping speaker name to enrolled embedding tensor.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Path of the test feature file; its parent directory
            name (up to the first ``_``) is printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: Similarity threshold for acceptance.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # Compare on the test embedding's device so an enrollment loaded on a
    # different device than the model still works.
    enroll_embedding = enroll_embedding.to(test_embedding.device)
    # .item() yields a plain float; formatting a 1-element NumPy array with
    # %f relies on an array-to-scalar conversion that NumPy has deprecated.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name; unlike splitting on '/', os.path also handles
    # the platform's own separator.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one speaker-verification trial with the settings below."""
    log_dir = 'new_model3'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True fails in .cuda()
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 11  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): the two frames are not used further down.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    # Set the true speaker
    enroll_speaker = '103F3021'

    # Set the claimed speaker
    test_speaker = '207F2088'

    # Threshold
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, the model is moved to the GPU before loading.
        log_dir: Directory that holds the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Passed to ``background_resnet`` as ``embedding_size``.
        n_classes: Passed to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can
    # still be read when CUDA is unavailable; load_state_dict() copies the
    # values into the model's parameters on whatever device they live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    # A model saved while wrapped in DataParallel has every key prefixed
    # with `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing under ``dataroot_dir`` into enroll and test sets.

    Args:
        dataroot_dir: Directory passed to ``read_feats_structure``.

    Returns:
        An ``(enroll_DB, test_DB)`` tuple of DataFrames, each re-indexed from
        0, holding the rows whose ``filename`` contains ``enroll.p`` and
        ``test.p`` respectively.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False matches the literal text; interpreted as a regular
    # expression the `.` would match any character (e.g. `enroll_p`).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index so both frames start at 0.
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every saved speaker embedding found in ``embedding_dir``.

    Args:
        embedding_dir: Directory containing ``<speaker>.pth`` files.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``). ``None`` keeps ``torch.load``'s default.

    Returns:
        Dict mapping the speaker name (file name without its ``.pth``
        extension) to the object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension; str.replace would
        # also delete a `.pth` occurring in the middle of the name.
        spk, ext = os.path.splitext(entry)
        embedding_path = os.path.join(embedding_dir, entry)
        # Skip anything that is not a saved embedding (stray files,
        # sub-directories) instead of handing it to torch.load.
        if ext != '.pth' or not os.path.isfile(embedding_path):
            continue
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute a length-normalized embedding for one feature file.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the final chunk may be shorter), each chunk is passed through ``model``,
    and the resulting embeddings are summed and L2-normalized.

    Args:
        use_cuda: If true, each chunk is moved to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns ``(embedding, _)``.
        test_frames: Number of frames per chunk.

    Returns:
        The summed, L2-normalized embedding tensor.

    Raises:
        ValueError: If the file yields no chunk to embed.
    """
    # Original note: features are (n_frames, n_dims) — confirm against read_MFB.
    features, _ = read_MFB(filename)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments <= 0:
        # Without any chunk the accumulator would stay the integer 0 and
        # l2_norm() would fail with an obscure AttributeError.
        raise ValueError('no frames to embed in %s' % filename)

    # Built once rather than per chunk; assumes the transform keeps no
    # per-call state — TODO confirm against SR_Dataset.
    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            # Original note: transform output is (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale ``input`` so its vectors along dimension 1 have L2 norm ``alpha``.

    Args:
        input: Tensor with at least two dimensions, e.g. ``(batch, dim)``.
        alpha: Factor applied after normalization; ``1`` gives unit length.

    Returns:
        Tensor shaped like ``input`` whose dim-1 vectors have norm ``alpha``
        (up to the epsilon used below).
    """
    # keepdim=True lets the norm broadcast against inputs of any rank >= 2,
    # not only 2-D ones.
    squared_sum = torch.sum(torch.pow(input, 2), dim=1, keepdim=True)
    # The small epsilon avoids a division by zero for all-zero vectors.
    norm = torch.sqrt(squared_sum + 1e-10)
    # The alpha scaling is the one suggested in
    # https://arxiv.org/pdf/1703.09507.pdf
    return torch.div(input, norm) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Accept or reject a test utterance against one enrolled speaker.

    The decision is ``Accept`` when the cosine similarity between the test
    embedding and the enrolled embedding exceeds ``thres``; the outcome,
    score and threshold are printed.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Network used to embed the test utterance.
        embeddings: Dict mapping speaker name to enrolled embedding tensor.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Path of the test feature file; its parent directory
            name (up to the first ``_``) is printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: Similarity threshold for acceptance.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # Compare on the test embedding's device so an enrollment loaded on a
    # different device than the model still works.
    enroll_embedding = enroll_embedding.to(test_embedding.device)
    # .item() yields a plain float; formatting a 1-element NumPy array with
    # %f relies on an array-to-scalar conversion that NumPy has deprecated.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name; unlike splitting on '/', os.path also handles
    # the platform's own separator.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one speaker-verification trial with the settings below."""
    log_dir = 'new_model4'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True fails in .cuda()
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 25  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): the two frames are not used further down.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    # Set the true speaker
    enroll_speaker = '229M2031'

    # Set the claimed speaker
    test_speaker = 'sunghwan1'

    # Threshold
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``'state_dict'`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location a checkpoint written from GPU tensors cannot be
    # deserialized on a CPU-only host, so remap to CPU when CUDA is not used.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # Keys are loaded as stored; no `module.` prefix stripping is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under *dataroot_dir* into enroll/test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' must match a literal dot. In the default regex
    # mode it matches any character (e.g. 'enroll_p' would be selected too).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding found directly in *embedding_dir*.

    Args:
        embedding_dir: Directory with one ``torch.save``-d file per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to the object returned by ``torch.load`` for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for f in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, f)
        # Sub-directories cannot be passed to torch.load; skip them.
        if not os.path.isfile(embedding_path):
            continue
        # Strip only a trailing '.pth'; str.replace would also remove the
        # substring from the middle of a speaker id.
        spk = f[:-len(suffix)] if f.endswith(suffix) else f
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for the utterance in *filename*.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the last chunk may be shorter), each chunk is passed through *model*,
    the first model output is summed over chunks, and the sum is normalized.

    Args:
        use_cuda: If true, move each chunk to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning a pair whose first item is the activation.
        test_frames: Number of frames per chunk.

    Returns:
        The result of ``l2_norm(summed_activation, 1)``.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # annotated upstream as (n_frames, n_dims)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously this fell through with activation == 0 (an int) and
        # failed inside l2_norm with an unrelated AttributeError.
        raise ValueError('No feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # loop-invariant, build once
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = to_tensor(features[start:start + test_frames])

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale each row of a 2-D tensor to unit L2 length, then multiply by *alpha*.

    Args:
        input: Tensor whose dim 1 is reduced to obtain the per-row norm.
        alpha: Scalar applied to the normalized rows (the scaling idea is
            from https://arxiv.org/pdf/1703.09507.pdf).

    Returns:
        Tensor with the same size as *input*.
    """
    original_shape = input.size()
    # Sum of squares per row; the 1e-10 offset keeps the divisor non-zero.
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    divisor = row_norm.view(-1, 1).expand_as(input)
    normalized = (input / divisor).view(original_shape)
    return normalized * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict of enrolled embeddings keyed by speaker id.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Feature file of the test utterance; the name of its
            parent directory (up to the first ``_``) is printed as the speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: A cosine score strictly above this prints ``'Accept'``.

    Raises:
        KeyError: If *enroll_speaker* has no entry in *embeddings*.
    """
    if enroll_speaker not in embeddings:
        raise KeyError('No enrolled embedding for speaker %r' % (enroll_speaker,))
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The enrolled tensor keeps the device it was saved from; align it with
    # the freshly computed embedding before comparing.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a Python float, so the comparison and the %f formatting
    # below no longer rely on implicit 1-element-array -> scalar conversion.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name, independent of the path separator in use.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print its outcome.

    Every setting is a literal below: edit the values to test another
    checkpoint, speaker pair or threshold.
    """
    # --- Locations ---
    checkpoint_dir = 'new_model4_merge'           # Where the checkpoints are saved
    enroll_dir = 'enroll_embeddings4_merge'       # Where embeddings are saved
    feature_root = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # --- Trial definition ---
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    true_speaker = '213F5100'     # key looked up in the enrolled embeddings
    claimed_speaker = '207F2088'  # sub-directory of feature_root holding test.p
    threshold = 0.95              # score above this prints 'Accept'
    utterance_path = os.path.join(feature_root, claimed_speaker, 'test.p')

    # --- Model settings ---
    gpu_enabled = True     # Use cuda or not
    embedding_dim = 128    # Dimension of speaker embeddings
    checkpoint_id = 50     # Which checkpoint to use?
    speaker_count = 348    # How many speakers in training data?
    segment_frames = 100   # Split the test utterance

    net = load_model(
        use_cuda=gpu_enabled,
        log_dir=checkpoint_dir,
        cp_num=checkpoint_id,
        embedding_size=embedding_dim,
        n_classes=speaker_count,
    )

    # The returned enroll/test tables are not used afterwards; the call is
    # kept because it was part of the original execution sequence.
    split_enroll_and_test(c.TEST_FEAT_DIR)

    enrolled = load_enroll_embeddings(enroll_dir)

    perform_verification(
        gpu_enabled, net, enrolled, true_speaker,
        utterance_path, segment_frames, threshold,
    )
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``'state_dict'`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location a checkpoint written from GPU tensors cannot be
    # deserialized on a CPU-only host, so remap to CPU when CUDA is not used.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # Keys are loaded as stored; no `module.` prefix stripping is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under *dataroot_dir* into enroll/test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' must match a literal dot. In the default regex
    # mode it matches any character (e.g. 'enroll_p' would be selected too).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding found directly in *embedding_dir*.

    Args:
        embedding_dir: Directory with one ``torch.save``-d file per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to the object returned by ``torch.load`` for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for f in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, f)
        # Sub-directories cannot be passed to torch.load; skip them.
        if not os.path.isfile(embedding_path):
            continue
        # Strip only a trailing '.pth'; str.replace would also remove the
        # substring from the middle of a speaker id.
        spk = f[:-len(suffix)] if f.endswith(suffix) else f
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for the utterance in *filename*.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the last chunk may be shorter), each chunk is passed through *model*,
    the first model output is summed over chunks, and the sum is normalized.

    Args:
        use_cuda: If true, move each chunk to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning a pair whose first item is the activation.
        test_frames: Number of frames per chunk.

    Returns:
        The result of ``l2_norm(summed_activation, 1)``.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # annotated upstream as (n_frames, n_dims)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously this fell through with activation == 0 (an int) and
        # failed inside l2_norm with an unrelated AttributeError.
        raise ValueError('No feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # loop-invariant, build once
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = to_tensor(features[start:start + test_frames])

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale each row of a 2-D tensor to unit L2 length, then multiply by *alpha*.

    Args:
        input: Tensor whose dim 1 is reduced to obtain the per-row norm.
        alpha: Scalar applied to the normalized rows (the scaling idea is
            from https://arxiv.org/pdf/1703.09507.pdf).

    Returns:
        Tensor with the same size as *input*.
    """
    original_shape = input.size()
    # Sum of squares per row; the 1e-10 offset keeps the divisor non-zero.
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    divisor = row_norm.view(-1, 1).expand_as(input)
    normalized = (input / divisor).view(original_shape)
    return normalized * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict of enrolled embeddings keyed by speaker id.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Feature file of the test utterance; the name of its
            parent directory (up to the first ``_``) is printed as the speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: A cosine score strictly above this prints ``'Accept'``.

    Raises:
        KeyError: If *enroll_speaker* has no entry in *embeddings*.
    """
    if enroll_speaker not in embeddings:
        raise KeyError('No enrolled embedding for speaker %r' % (enroll_speaker,))
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The enrolled tensor keeps the device it was saved from; align it with
    # the freshly computed embedding before comparing.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a Python float, so the comparison and the %f formatting
    # below no longer rely on implicit 1-element-array -> scalar conversion.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name, independent of the path separator in use.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print its outcome.

    Every setting is a literal below: edit the values to test another
    checkpoint, speaker pair or threshold.
    """
    # --- Locations ---
    checkpoint_dir = 'new_model4_zeroth'          # Where the checkpoints are saved
    enroll_dir = 'enroll_embeddings4_zeroth'      # Where embeddings are saved
    feature_root = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # --- Trial definition ---
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    # NOTE(review): '777M7777' is not in the list above; it must exist as a
    # key in the loaded embeddings for the lookup to succeed.
    true_speaker = '777M7777'     # key looked up in the enrolled embeddings
    claimed_speaker = '103F3021'  # sub-directory of feature_root holding test.p
    threshold = 0.95              # score above this prints 'Accept'
    utterance_path = os.path.join(feature_root, claimed_speaker, 'test.p')

    # --- Model settings ---
    gpu_enabled = True     # Use cuda or not
    embedding_dim = 128    # Dimension of speaker embeddings
    checkpoint_id = 30     # Which checkpoint to use?
    speaker_count = 105    # How many speakers in training data?
    segment_frames = 100   # Split the test utterance

    net = load_model(
        use_cuda=gpu_enabled,
        log_dir=checkpoint_dir,
        cp_num=checkpoint_id,
        embedding_size=embedding_dim,
        n_classes=speaker_count,
    )

    # The returned enroll/test tables are not used afterwards; the call is
    # kept because it was part of the original execution sequence.
    split_enroll_and_test(c.TEST_FEAT_DIR)

    enrolled = load_enroll_embeddings(enroll_dir)

    perform_verification(
        gpu_enabled, net, enrolled, true_speaker,
        utterance_path, segment_frames, threshold,
    )
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model5 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``'state_dict'`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location a checkpoint written from GPU tensors cannot be
    # deserialized on a CPU-only host, so remap to CPU when CUDA is not used.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # Keys are loaded as stored; no `module.` prefix stripping is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under *dataroot_dir* into enroll/test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' must match a literal dot. In the default regex
    # mode it matches any character (e.g. 'enroll_p' would be selected too).
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding found directly in *embedding_dir*.

    Args:
        embedding_dir: Directory with one ``torch.save``-d file per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to the object returned by ``torch.load`` for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for f in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, f)
        # Sub-directories cannot be passed to torch.load; skip them.
        if not os.path.isfile(embedding_path):
            continue
        # Strip only a trailing '.pth'; str.replace would also remove the
        # substring from the middle of a speaker id.
        spk = f[:-len(suffix)] if f.endswith(suffix) else f
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for the utterance in *filename*.

    The features are cut into consecutive chunks of ``test_frames`` frames
    (the last chunk may be shorter), each chunk is passed through *model*,
    the first model output is summed over chunks, and the sum is normalized.

    Args:
        use_cuda: If true, move each chunk to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning a pair whose first item is the activation.
        test_frames: Number of frames per chunk.

    Returns:
        The result of ``l2_norm(summed_activation, 1)``.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # annotated upstream as (n_frames, n_dims)

    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously this fell through with activation == 0 (an int) and
        # failed inside l2_norm with an unrelated AttributeError.
        raise ValueError('No feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # loop-invariant, build once
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = to_tensor(features[start:start + test_frames])

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale each row of a 2-D tensor to unit L2 length, then multiply by *alpha*.

    Args:
        input: Tensor whose dim 1 is reduced to obtain the per-row norm.
        alpha: Scalar applied to the normalized rows (the scaling idea is
            from https://arxiv.org/pdf/1703.09507.pdf).

    Returns:
        Tensor with the same size as *input*.
    """
    original_shape = input.size()
    # Sum of squares per row; the 1e-10 offset keeps the divisor non-zero.
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    divisor = row_norm.view(-1, 1).expand_as(input)
    normalized = (input / divisor).view(original_shape)
    return normalized * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict of enrolled embeddings keyed by speaker id.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Feature file of the test utterance; the name of its
            parent directory (up to the first ``_``) is printed as the speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: A cosine score strictly above this prints ``'Accept'``.

    Raises:
        KeyError: If *enroll_speaker* has no entry in *embeddings*.
    """
    if enroll_speaker not in embeddings:
        raise KeyError('No enrolled embedding for speaker %r' % (enroll_speaker,))
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The enrolled tensor keeps the device it was saved from; align it with
    # the freshly computed embedding before comparing.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a Python float, so the comparison and the %f formatting
    # below no longer rely on implicit 1-element-array -> scalar conversion.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name, independent of the path separator in use.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print its outcome.

    Every setting is a literal below: edit the values to test another
    checkpoint, speaker pair or threshold.
    """
    # --- Locations ---
    checkpoint_dir = 'new_model5'                 # Where the checkpoints are saved
    enroll_dir = 'enroll_embeddings5'             # Where embeddings are saved
    feature_root = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # --- Trial definition ---
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    # NOTE(review): '777M7777' is not in the list above; it must exist as a
    # key in the loaded embeddings for the lookup to succeed.
    true_speaker = '777M7777'      # key looked up in the enrolled embeddings
    claimed_speaker = 'sunghwan1'  # sub-directory of feature_root holding test.p
    threshold = 0.95               # score above this prints 'Accept'
    utterance_path = os.path.join(feature_root, claimed_speaker, 'test.p')

    # --- Model settings ---
    gpu_enabled = True     # Use cuda or not
    embedding_dim = 128    # Dimension of speaker embeddings
    checkpoint_id = 30     # Which checkpoint to use?
    speaker_count = 241    # How many speakers in training data?
    segment_frames = 100   # Split the test utterance

    net = load_model(
        use_cuda=gpu_enabled,
        log_dir=checkpoint_dir,
        cp_num=checkpoint_id,
        embedding_size=embedding_dim,
        n_classes=speaker_count,
    )

    # The returned enroll/test tables are not used afterwards; the call is
    # kept because it was part of the original execution sequence.
    split_enroll_and_test(c.TEST_FEAT_DIR)

    enrolled = load_enroll_embeddings(enroll_dir)

    perform_verification(
        gpu_enabled, net, enrolled, true_speaker,
        utterance_path, segment_frames, threshold,
    )
139 +
140 +if __name__ == '__main__':
141 + main()