
Speaker Recognition V1 resnet18

1 +"""
2 +Modification of the function 'DBspeech_wav_reader.py' of the deep-speaker created by philipperemy
3 +Working on python 3
4 +Input : DB path
5 +Output : 1) Make DB structure using pd.DataFrame which has 3 columns (file id, file path, speaker id, DB id)
6 + => 'read_DB_structure' function
7 + 2) Read a wav file from DB structure
8 + => 'read_audio' function
9 +"""
10 +import logging
11 +import os
12 +from glob import glob
13 +
14 +import librosa
15 +import numpy as np
16 +import pandas as pd
17 +
18 +from configure import SAMPLE_RATE
19 +
20 +np.set_printoptions(threshold=np.nan)
21 +pd.set_option('display.max_rows', 500)
22 +pd.set_option('display.max_columns', 500)
23 +pd.set_option('display.width', 1000)
24 +pd.set_option('max_colwidth', 100)
25 +
26 +
27 +def find_wavs(directory, pattern='**/*.wav'):
28 + """Recursively finds all files matching the pattern."""
29 + return glob(os.path.join(directory, pattern), recursive=True)
30 +
31 +def find_feats(directory, pattern='**/*.p'):
32 + """Recursively finds all files matching the pattern."""
33 + return glob(os.path.join(directory, pattern), recursive=True)
34 +
35 +def read_audio(filename, sample_rate=SAMPLE_RATE):
36 + audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
37 + audio = audio.flatten()
38 + return audio
39 +
40 +def read_DB_structure(directory):
41 + DB = pd.DataFrame()
42 + DB['filename'] = find_wavs(directory) # filename
43 + DB['filename'] = DB['filename'].apply(lambda x: x.replace('\\', '/')) # normalize windows paths
44 + DB['speaker_id'] = DB['filename'].apply(lambda x: x.split('/')[-2]) # speaker folder name
45 + DB['dataset_id'] = DB['filename'].apply(lambda x: x.split('/')[-3]) # dataset folder name
46 + num_speakers = len(DB['speaker_id'].unique())
47 + logging.info('Found {} files with {} different speakers.'.format(str(len(DB)).zfill(7), str(num_speakers).zfill(5)))
48 + logging.info(DB.head(10))
49 + return DB
50 +
51 +def read_feats_structure(directory):
52 + DB = pd.DataFrame()
53 + DB['filename'] = find_feats(directory) # filename
54 + DB['filename'] = DB['filename'].apply(lambda x: x.replace('\\', '/')) # normalize windows paths
55 + DB['speaker_id'] = DB['filename'].apply(lambda x: x.split('/')[-2]) # speaker folder name
56 + DB['dataset_id'] = DB['filename'].apply(lambda x: x.split('/')[-3]) # dataset folder name
57 + num_speakers = len(DB['speaker_id'].unique())
58 + logging.info('Found {} files with {} different speakers.'.format(str(len(DB)).zfill(7), str(num_speakers).zfill(5)))
59 + logging.info(DB.head(10))
60 + return DB
61 +
62 +def test():
63 + DB_dir = '/home/administrator/Desktop/DB/Speaker_robot_train_DB'
64 + DB = read_DB_structure(DB_dir)
65 + test_wav = read_audio(DB[0:1]['filename'].values[0])
66 + return DB, test_wav
67 +
68 +
69 +if __name__ == '__main__':
70 + DB, test_wav = test()
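
Both *_structure readers assume a three-level directory layout, <dataset_id>/<speaker_id>/<file>. A minimal usage sketch (the DB path below is hypothetical):

# expected layout: /path/to/DB/<dataset_id>/<speaker_id>/<utterance>.wav
from DB_wav_reader import read_DB_structure

DB = read_DB_structure('/path/to/DB')
print(DB[['filename', 'speaker_id', 'dataset_id']].head())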
# ===== SR_Dataset.py =====
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import random
import os
import pickle  # for Python 3
import numpy as np
import configure as c
from DB_wav_reader import read_feats_structure

def read_MFB(filename):
    with open(filename, 'rb') as f:
        feat_and_label = pickle.load(f)

    feature = feat_and_label['feat']  # size : (n_frames, dim=40)
    label = feat_and_label['label']

    # Crude VAD: drop the first and last 0.5 s (assuming a 10 ms frame shift)
    start_sec, end_sec = 0.5, 0.5
    start_frame = int(start_sec / 0.01)
    end_frame = len(feature) - int(end_sec / 0.01)
    ori_feat = feature
    feature = feature[start_frame:end_frame, :]
    assert len(feature) > 40, (
        'length is too short. len:%s, ori_len:%s, file:%s' % (len(feature), len(ori_feat), filename))
    return feature, label
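
# The .p feature files are assumed (from read_MFB above) to be pickled dicts:
#   {'feat': np.ndarray of shape (n_frames, 40), 'label': <speaker_id>}
# A hypothetical writer, for illustration only:
#   with open('enroll.p', 'wb') as f:
#       pickle.dump({'feat': logfbank_feat, 'label': speaker_id}, f)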

class TruncatedInputfromMFB(object):
    """
    input  size : (n_frames, dim=40)
    output size : (input_per_file, n_win=c.NUM_WIN_SIZE, dim=40)
    => one context window per requested input is chosen randomly
    """
    def __init__(self, input_per_file=1):
        super(TruncatedInputfromMFB, self).__init__()
        self.input_per_file = input_per_file

    def __call__(self, frames_features):
        network_inputs = []
        num_frames = len(frames_features)

        win_size = c.NUM_WIN_SIZE
        half_win_size = int(win_size / 2)
        # If the utterance is shorter than one window, tile it until it fits.
        while num_frames - half_win_size <= half_win_size:
            frames_features = np.append(frames_features, frames_features[:num_frames, :], axis=0)
            num_frames = len(frames_features)

        for i in range(self.input_per_file):
            # j >= half_win_size >= 1, so the slice below is always a full window
            j = random.randrange(half_win_size, num_frames - half_win_size)
            frames_slice = frames_features[j - half_win_size:j + half_win_size]
            network_inputs.append(frames_slice)
        return np.array(network_inputs)
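
# Shape sketch: with NUM_WIN_SIZE = 100 and a (774, 40) feature matrix,
# TruncatedInputfromMFB()(feat).shape == (1, 100, 40).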


class TruncatedInputfromMFB_test(object):
    """Slide a context window over every valid frame of a test utterance."""
    def __init__(self, input_per_file=1):
        super(TruncatedInputfromMFB_test, self).__init__()
        self.input_per_file = input_per_file

    def __call__(self, frames_features):
        network_inputs = []
        num_frames = len(frames_features)

        for i in range(self.input_per_file):
            for j in range(c.NUM_PREVIOUS_FRAME, num_frames - c.NUM_NEXT_FRAME):
                frames_slice = frames_features[j - c.NUM_PREVIOUS_FRAME:j + c.NUM_NEXT_FRAME]
                network_inputs.append(frames_slice)
        return np.array(network_inputs)

class TruncatedInputfromMFB_CNN_test(object):
    """Same as TruncatedInputfromMFB_test, but adds a channel axis for CNN input."""
    def __init__(self, input_per_file=1):
        super(TruncatedInputfromMFB_CNN_test, self).__init__()
        self.input_per_file = input_per_file

    def __call__(self, frames_features):
        network_inputs = []
        num_frames = len(frames_features)

        for i in range(self.input_per_file):
            for j in range(c.NUM_PREVIOUS_FRAME, num_frames - c.NUM_NEXT_FRAME):
                frames_slice = frames_features[j - c.NUM_PREVIOUS_FRAME:j + c.NUM_NEXT_FRAME]
                network_inputs.append(frames_slice)
        network_inputs = np.expand_dims(network_inputs, axis=1)  # list -> array of shape (n_win, 1, win, dim)
        assert network_inputs.ndim == 4, 'Data is not a 4D tensor. size:%s' % (np.shape(network_inputs),)
        return network_inputs

class ToTensorInput(object):
    """Convert ndarrays in sample to Tensors."""
    def __call__(self, np_feature):
        """
        Args:
            np_feature (numpy.ndarray): feature to be converted to tensor.
        Returns:
            Tensor: converted feature.
        """
        if isinstance(np_feature, np.ndarray):
            # (1, n_win, dim=40) -> (1, dim=40, n_win), as torch.FloatTensor
            ten_feature = torch.from_numpy(np_feature.transpose((0, 2, 1))).float()
            return ten_feature

class ToTensorDevInput(object):
    """Convert ndarrays in sample to Tensors."""
    def __call__(self, np_feature):
        """
        Args:
            np_feature (numpy.ndarray): feature to be converted to tensor.
        Returns:
            Tensor: converted feature.
        """
        if isinstance(np_feature, np.ndarray):
            np_feature = np.expand_dims(np_feature, axis=0)
            assert np_feature.ndim == 3, 'Data is not a 3D tensor. size:%s' % (np.shape(np_feature),)
            # (1, n_win, dim=40) -> (1, dim=40, n_win), as torch.FloatTensor
            ten_feature = torch.from_numpy(np_feature.transpose((0, 2, 1))).float()
            return ten_feature

class ToTensorTestInput(object):
    """Convert ndarrays in sample to Tensors."""
    def __call__(self, np_feature):
        """
        Args:
            np_feature (numpy.ndarray): feature to be converted to tensor.
        Returns:
            Tensor: converted feature.
        """
        if isinstance(np_feature, np.ndarray):
            np_feature = np.expand_dims(np_feature, axis=0)
            np_feature = np.expand_dims(np_feature, axis=1)
            assert np_feature.ndim == 4, 'Data is not a 4D tensor. size:%s' % (np.shape(np_feature),)
            # (1, 1, n_frames, dim=40) -> (1, 1, dim=40, n_frames), as torch.FloatTensor
            ten_feature = torch.from_numpy(np_feature.transpose((0, 1, 3, 2))).float()
            return ten_feature

def collate_fn_feat_padded(batch):
    """
    Sort a list of (feature, label) tuples by frame length in descending order,
    then pad (by tiling) every feature up to the longest one in the batch.
    batch : list of tuples (feature, label), len(batch) = batch_size
        - feature : torch tensor of shape (1, dim, n_frames); n_frames varies
        - label   : torch tensor of shape (1)
    e.g. batch = [dataset[i] for i in batch_indices], where
         batch[0][0].shape == torch.Size([1, 40, 774]) and 774 is the frame count.
    """
    batch.sort(key=lambda x: x[0].shape[2], reverse=True)
    feats, labels = zip(*batch)

    # Merge labels => torch.Size([batch_size])
    labels = torch.stack(labels, 0)
    labels = labels.view(-1)

    # Merge features => torch.Size([batch_size, 1, dim, max(n_frames)])
    lengths = [feat.shape[2] for feat in feats]  # in decreasing order
    max_length = lengths[0]
    padded_features = torch.zeros(len(feats), feats[0].shape[0], feats[0].shape[1], max_length).float()
    for i, feat in enumerate(feats):
        end = lengths[i]
        num_frames = feat.shape[2]
        # Tile the utterance until it is at least as long as the longest one
        while max_length > num_frames:
            feat = torch.cat((feat, feat[:, :, :end]), 2)
            num_frames = feat.shape[2]

        padded_features[i, :, :, :] = feat[:, :, :max_length]

    return padded_features, labels
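
# Intended use (a sketch): pass as collate_fn when each sample keeps its full,
# variable-length feature tensor of shape (1, dim, n_frames), e.g.
#   loader = data.DataLoader(dataset, batch_size=5, collate_fn=collate_fn_feat_padded)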

class DvectorDataset(data.Dataset):
    def __init__(self, DB, loader, spk_to_idx, transform=None, *arg, **kw):
        self.DB = DB
        self.len = len(DB)
        self.transform = transform
        self.loader = loader
        self.spk_to_idx = spk_to_idx

    def __getitem__(self, index):
        feat_path = self.DB['filename'][index]
        feature, label = self.loader(feat_path)
        label = self.spk_to_idx[label]
        label = torch.Tensor([label]).long()
        if self.transform:
            feature = self.transform(feature)

        return feature, label

    def __len__(self):
        return self.len

def main():
    # Build the train DB from the extracted features (read_MFB loads pickles,
    # so we index the feature directory rather than the raw wav directory).
    train_DB = read_feats_structure(c.TRAIN_FEAT_DIR)
    transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput()
    ])
    file_loader = read_MFB
    speaker_list = sorted(set(train_DB['speaker_id']))
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}
    batch_size = 128
    Dvector_train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
    Dvector_train_loader = torch.utils.data.DataLoader(dataset=Dvector_train_dataset,
                                                       batch_size=batch_size,
                                                       shuffle=False)
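
    # Iterating the loader yields (a sketch of the expected shapes):
    #   feats  : torch.Size([128, 1, 40, 100])  after default collation
    #   labels : torch.Size([128, 1])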

if __name__ == '__main__':
    main()
# ===== configure.py =====
# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'

# Feature path
TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'

# Context window size (in frames)
NUM_WIN_SIZE = 100

# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
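
# Frames before/after the current frame in a sliding test window.
# SR_Dataset references these, but they were not defined in this commit;
# the even 50/50 split of NUM_WIN_SIZE is an assumed default.
NUM_PREVIOUS_FRAME = 50
NUM_NEXT_FRAME = 50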
# ===== enroll.py =====
import torch

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # checkpoint is a dict saved as {'state_dict': model.state_dict(), ...}
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # regex=False: match the literal filenames 'enroll.p' / 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # (n, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n,)
    norm = torch.sqrt(normp)  # size: (n,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but callers here pass alpha = 1 (the scale is irrelevant for cosine scoring).
    output = output * alpha
    return output
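
# Row-wise L2 normalization, i.e.
#   output[i, j] = alpha * input[i, j] / sqrt(sum_k input[i, k]**2 + 1e-10)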

def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Output one d-vector per enroll speaker: per-utterance d-vectors are
    L2-normalized, then summed (equivalent to the average up to scale).
    Return a dictionary of length n_spk.
    """
    n_files = len(DB)
    enroll_speaker_list = sorted(set(DB['speaker_id']))

    embeddings = {}

    # Aggregate all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")

    for i in range(n_files):
        filename = DB['filename'][i]
        spk = DB['speaker_id'][i]

        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation

        print("Aggregates the activation (spk : %s)" % (spk))

    if not os.path.exists(embedding_dir):
        os.makedirs(embedding_dir)

    # Save the embeddings
    for spk_index in enroll_speaker_list:
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings

def main():

    # Settings
    use_cuda = True
    log_dir = 'model_saved'
    embedding_size = 128
    cp_num = 24  # which checkpoint to use
    n_classes = 240
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the enroll DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Where to save embeddings
    embedding_dir = 'enroll_embeddings'

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

if __name__ == '__main__':
    main()
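
After enrollment, each speaker's aggregated d-vector can be reloaded directly; a sketch (the speaker id comes from the demo list above):

import torch

emb = torch.load('enroll_embeddings/103F3021.pth')
print(emb.shape)  # torch.Size([1, 128])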
# ===== identification.py =====
import torch
import torch.nn.functional as F

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # checkpoint is a dict saved as {'state_dict': model.state_dict(), ...}
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # regex=False: match the literal filenames 'enroll.p' / 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # (n, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n,)
    norm = torch.sqrt(normp)  # size: (n,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but callers here pass alpha = 1 (the scale is irrelevant for cosine scoring).
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    # Pick the enrolled speaker whose embedding has the highest cosine similarity
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -float('inf')
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.data.cpu().numpy()
        if score > max_score:
            max_score = score
            best_spk = spk
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" % (true_spk, best_spk, true_spk == best_spk))
    return best_spk

def main():

    log_dir = 'model_saved'  # where the checkpoints are saved
    embedding_dir = 'enroll_embeddings'  # where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # where test features are saved

    # Settings
    use_cuda = True  # use CUDA or not
    embedding_size = 128  # dimension of the speaker embeddings
    cp_num = 24  # which checkpoint to use
    n_classes = 240  # number of speakers in the training data
    test_frames = 100  # segment length used to split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063']

    # Set the test speaker
    test_speaker = '230M4087'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
# ===== model/model.py =====
import torch.nn as nn
import torch.nn.functional as F
import model.resnet as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from the (untrained) backbone definitions
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(128, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: (minibatch, 1, n_filter=40, n_frames); note the stem skips maxpool
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)

        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # (batch, 128, 1, 1)
        out = out.view(x.size(0), -1)  # flatten: (batch, 128); safe for batch size 1
        spk_embedding = self.fc0(out)  # (batch, n_embed)
        out = F.relu(self.bn0(spk_embedding))
        out = self.last(out)  # (batch, num_classes)

        return spk_embedding, out
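
A quick shape check for the model above; a sketch assuming a random batch of two 100-frame, 40-filter inputs:

import torch
from model.model import background_resnet

model = background_resnet(embedding_size=128, num_classes=240)
model.eval()
x = torch.randn(2, 1, 40, 100)  # (batch, channel, n_filter, n_frames)
emb, logits = model(x)
print(emb.shape, logits.shape)  # torch.Size([2, 128]) torch.Size([2, 240])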
1 +"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
2 +and added support for the 1x32x32 mel spectrogram for the speech recognition.
3 +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
4 +https://arxiv.org/abs/1512.03385
5 +"""
6 +
7 +import torch.nn as nn
8 +import math
9 +import torch.utils.model_zoo as model_zoo
10 +
11 +
12 +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
13 + 'resnet152']
14 +
15 +
16 +model_urls = {
17 + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
18 + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
19 + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
20 + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
21 + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
22 +}
23 +


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, in_channels=1):
        self.inplanes = 16
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
                               bias=False)  # original torchvision stem uses stride=2
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(1, stride=1)
        self.fc = nn.Linear(128 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model
# ===== verification.py =====
import torch
import torch.nn.functional as F

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # checkpoint is a dict saved as {'state_dict': model.state_dict(), ...}
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # regex=False: match the literal filenames 'enroll.p' / 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # (n, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n,)
    norm = torch.sqrt(normp)  # size: (n,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but callers here pass alpha = 1 (the scale is irrelevant for cosine scoring).
    output = output * alpha
    return output

def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    score = F.cosine_similarity(test_embedding, enroll_embedding)
    score = score.data.cpu().numpy()

    if score > thres:
        result = 'Accept'
    else:
        result = 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker : %s\nClaimed speaker : %s\n\nResult : %s\n" % (enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" % (score, thres))

def main():

    log_dir = 'model_saved'  # where the checkpoints are saved
    embedding_dir = 'enroll_embeddings'  # where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # where test features are saved

    # Settings
    use_cuda = True  # use CUDA or not
    embedding_size = 128  # dimension of the speaker embeddings
    cp_num = 24  # which checkpoint to use
    n_classes = 240  # number of speakers in the training data
    test_frames = 100  # segment length used to split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    # Set the true (enrolled) speaker
    enroll_speaker = '230M4087'

    # Set the claimed speaker
    test_speaker = '230M4087'

    # Threshold (demo value; in practice tuned on a dev set, e.g. at the equal error rate)
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
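
Typical run order (assuming the file names used in the headers above): run enroll.py first to build enroll_embeddings/ from the enrollment features, then run identification.py or verification.py, which score test utterances against the saved embeddings by cosine similarity.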