Showing
26 changed files
with
722 additions
and
2 deletions
... | @@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs' | ... | @@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs' |
5 | 5 | ||
6 | # Feature path | 6 | # Feature path |
7 | TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train' | 7 | TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train' |
8 | +# TRAIN_FEAT_DIR = '/test/merge_dataset' | ||
9 | +# TRAIN_FEAT_DIR = '/test/trainFeature' | ||
8 | TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' | 10 | TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' |
9 | 11 | ||
10 | # Context window size | 12 | # Context window size | ... | ... |
Speaker_Recognition/configure1_merge.py
0 → 100644
# Configuration constants for the "merge" dataset variant.

# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'

# Feature path
# Alternative feature locations are kept below as commented-out assignments.
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/merge_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/merge_test_dataset'
# Context window size
NUM_WIN_SIZE = 100 #10

# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/configure1_zeroth.py
0 → 100644
# Configuration constants for the "zeroth" dataset variant.

# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'

# Feature path
# Alternative feature locations are kept below as commented-out assignments.
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/zeroth_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/zeroth_test_dataset'
# Context window size
NUM_WIN_SIZE = 100 #10

# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/enroll1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to restore.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``'state_dict'`` entry loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Remap to CPU when CUDA is not requested; otherwise a checkpoint that was
    # written from a GPU cannot be deserialised on a CPU-only run.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename'].str

    # regex=False: the '.' must match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the feature file ``filename``.

    The features are cut into consecutive segments of at most ``test_frames``
    frames; the model's first output for each segment is summed and the sum
    is passed through ``l2_norm``.

    Args:
        use_cuda: If true, move every segment to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning ``(activation, _)`` for a segment tensor.
        test_frames: Maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    n_frames = len(features)
    if n_frames == 0:
        # Without this guard the integer 0 would reach l2_norm and fail there.
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every segment
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
59 | + | ||
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to unit L2 norm, then multiply by ``alpha``.

    A constant of 1e-10 is added to the squared norm before the square root
    so that an all-zero row does not divide by zero.
    """
    original_shape = input.size()  # size:(n_rows, dim)
    squared_norm = input.pow(2).sum(dim=1) + 1e-10  # size:(n_rows)
    row_norm = squared_norm.sqrt().view(-1, 1)
    unit_rows = input / row_norm.expand_as(input)
    return unit_rows.view(original_shape) * alpha
70 | + | ||
def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """Accumulate the embeddings of all enrollment files per speaker and save them.

    The per-file embeddings of a speaker are summed (they are not divided by
    the number of files). One ``<speaker_id>.pth`` file per speaker is written
    to ``embedding_dir``.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        test_frames: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        DB: DataFrame with ``'filename'`` and ``'speaker_id'`` columns.
        embedding_dir: Output directory; created when missing.

    Returns:
        Dict mapping speaker id to that speaker's accumulated embedding.
    """
    embeddings = {}

    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")

    # Iterate positionally so the result does not depend on the DataFrame
    # carrying a 0..n-1 index.
    for filename, spk in zip(DB['filename'], DB['speaker_id']):
        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation

        print("Aggregates the activation (spk : %s)" % (spk))

    os.makedirs(embedding_dir, exist_ok=True)

    # Save the embeddings
    for spk_index in sorted(embeddings):
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings
105 | + | ||
def main():
    """Enroll the speakers found under ``c.TEST_FEAT_DIR`` and save their embeddings."""

    # Settings
    # Fall back to the CPU instead of crashing when no GPU is available.
    use_cuda = torch.cuda.is_available()
    log_dir = 'new_model1'
    embedding_size = 128
    # NOTE(review): identification1.py and verification1.py score against these
    # embeddings with checkpoints 30 / 29 and test_frames = 100 - confirm that
    # the mismatch with the values below is intended.
    cp_num = 24 # Which checkpoint to use?
    n_classes = 241
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for enroll DB (the test half is not needed here)
    enroll_DB, _ = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Where to save embeddings
    embedding_dir = 'enroll_embeddings1'

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'


if __name__ == '__main__':
    main()
... | \ No newline at end of file | ... | \ No newline at end of file |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
... | @@ -123,10 +123,10 @@ def main(): | ... | @@ -123,10 +123,10 @@ def main(): |
123 | """ | 123 | """ |
124 | 124 | ||
125 | spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\ | 125 | spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\ |
126 | - '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'] | 126 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777'] |
127 | 127 | ||
128 | # Set the test speaker | 128 | # Set the test speaker |
129 | - test_speaker = '230M4087' | 129 | + test_speaker = '778M8777' |
130 | 130 | ||
131 | test_path = os.path.join(test_dir, test_speaker, 'test.p') | 131 | test_path = os.path.join(test_dir, test_speaker, 'test.p') |
132 | 132 | ... | ... |
Speaker_Recognition/identification1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to restore.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``'state_dict'`` entry loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Remap to CPU when CUDA is not requested; otherwise a checkpoint that was
    # written from a GPU cannot be deserialised on a CPU-only run.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename'].str

    # regex=False: the '.' must match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 | + | ||
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file in ``embedding_dir``.

    Entries without the ``.pth`` extension are skipped, so stray files in the
    directory are not handed to ``torch.load``.

    Args:
        embedding_dir: Directory holding one saved embedding per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object loaded from that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip a '.pth' occurring inside the speaker name.
        spk, ext = os.path.splitext(entry)
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, entry))

    return embeddings
49 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the feature file ``filename``.

    The features are cut into consecutive segments of at most ``test_frames``
    frames; the model's first output for each segment is summed and the sum
    is passed through ``l2_norm``.

    Args:
        use_cuda: If true, move every segment to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning ``(activation, _)`` for a segment tensor.
        test_frames: Maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    n_frames = len(features)
    if n_frames == 0:
        # Without this guard the integer 0 would reach l2_norm and fail there.
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every segment
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 | + | ||
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to unit L2 norm, then multiply by ``alpha``.

    A constant of 1e-10 is added to the squared norm before the square root
    so that an all-zero row does not divide by zero.
    """
    original_shape = input.size()  # size:(n_rows, dim)
    squared_norm = input.pow(2).sum(dim=1) + 1e-10  # size:(n_rows)
    row_norm = squared_norm.sqrt().view(-1, 1)
    unit_rows = input / row_norm.expand_as(input)
    return unit_rows.view(original_shape) * alpha
81 | + | ||
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to the test utterance.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to enrolled embedding.
        test_filename: Feature file of the test utterance; the name of its
            parent directory (up to the first ``'_'``) is reported as the
            true speaker.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Candidate speaker ids to score.

    Returns:
        The speaker id with the highest cosine similarity, or ``None`` when
        ``spk_list`` is empty.

    Raises:
        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        if spk not in embeddings:
            raise KeyError('No enrollment embedding loaded for speaker %s' % spk)
        # .item() turns the one-element similarity tensor into a Python float.
        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # Parent directory name, independent of the path separator in use.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 | + | ||
def main():
    """Identify one hard-coded test speaker against the saved enrollment embeddings."""

    log_dir = 'new_model1' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
    # NOTE(review): hard-coded here although c.TEST_FEAT_DIR is used below -
    # confirm the two are meant to point at the same data.
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved

    # Settings
    # Fall back to the CPU instead of crashing when no GPU is available.
    use_cuda = torch.cuda.is_available() # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    # NOTE(review): enroll1.py creates the embeddings with cp_num = 24 and
    # test_frames = 200 - confirm that the different values here are intended.
    cp_num = 30 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further below)
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
                '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '213F5100'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)


if __name__ == '__main__':
    main()
Speaker_Recognition/model/model1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn as nn | ||
3 | +import torch.nn.functional as F | ||
4 | +from torch.autograd import Function | ||
5 | +import model.resnet as resnet | ||
6 | + | ||
7 | + | ||
class background_resnet(nn.Module):
    """Speaker-embedding network built on top of a ResNet backbone.

    Only ``conv1``, ``bn1``, ``relu`` and ``layer1``-``layer4`` of the
    backbone are used in ``forward``; their output is average-pooled to one
    value per channel, projected to the embedding (``fc0``) and classified
    (``bn0`` -> ReLU -> ``last``).

    Args:
        embedding_size: Dimension of the speaker embedding.
        num_classes: Number of outputs of the classification layer.
        backbone: One of ``'resnet18'``, ``'resnet34'``, ``'resnet50'``,
            ``'resnet101'``, ``'resnet152'``.

    Raises:
        RuntimeError: If ``backbone`` is not one of the supported names.
    """

    _SUPPORTED_BACKBONES = ('resnet18', 'resnet34', 'resnet50',
                            'resnet101', 'resnet152')

    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        if backbone not in self._SUPPORTED_BACKBONES:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # The backbone is always created with fresh weights (pretrained=False).
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Take the feature width from the backbone's own classifier input
        # (128 * block.expansion) rather than hard-coding 128, which is only
        # correct for the BasicBlock backbones.
        self.fc0 = nn.Linear(self.pretrained.fc.in_features, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, out)`` for a batch of inputs.

        ``spk_embedding`` is the output of ``fc0``; ``out`` is the output of
        the classification layer ``last``.
        """
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)

        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, channels, 1, 1]
        # Flatten from dim 1 so the batch dimension survives for batch size 1.
        out = torch.flatten(out, 1)  # (n_batch, channels)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)

        return spk_embedding, out
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -113,6 +113,7 @@ class ResNet(nn.Module): | ... | @@ -113,6 +113,7 @@ class ResNet(nn.Module): |
113 | self.layer2 = self._make_layer(block, 32, layers[1], stride=2) | 113 | self.layer2 = self._make_layer(block, 32, layers[1], stride=2) |
114 | self.layer3 = self._make_layer(block, 64, layers[2], stride=2) | 114 | self.layer3 = self._make_layer(block, 64, layers[2], stride=2) |
115 | self.layer4 = self._make_layer(block, 128, layers[3], stride=2) | 115 | self.layer4 = self._make_layer(block, 128, layers[3], stride=2) |
116 | + | ||
116 | self.avgpool = nn.AvgPool2d(1, stride=1) | 117 | self.avgpool = nn.AvgPool2d(1, stride=1) |
117 | self.fc = nn.Linear(128 * block.expansion, num_classes) | 118 | self.fc = nn.Linear(128 * block.expansion, num_classes) |
118 | 119 | ... | ... |
Speaker_Recognition/model/resnet1.py
0 → 100644
1 | +"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py | ||
2 | +and added support for the 1x32x32 mel spectrogram for the speech recognition. | ||
3 | +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition | ||
4 | +https://arxiv.org/abs/1512.03385 | ||
5 | +""" | ||
6 | + | ||
7 | +import torch.nn as nn | ||
8 | +import math | ||
9 | +import torch.utils.model_zoo as model_zoo | ||
10 | + | ||
11 | + | ||
# Names exported by a star-import of this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Checkpoint URLs downloaded by the factory functions when ``pretrained=True``.
# NOTE(review): the ResNet defined in this module uses a 1-channel stem with
# 16 base planes, so these published weights presumably do not match its
# parameter shapes - confirm before calling a factory with pretrained=True.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
23 | + | ||
24 | + | ||
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
29 | + | ||
30 | + | ||
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch norm.

    ``downsample``, when given, is applied to the block input to produce the
    shortcut that is added before the final ReLU.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # First 3x3 convolution carries the stride; the second keeps the size.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        main += shortcut
        return self.relu(main)
61 | + | ||
62 | + | ||
class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The last convolution produces ``planes * 4`` channels. ``downsample``,
    when given, is applied to the block input to produce the shortcut that
    is added before the final ReLU.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        out_planes = planes * 4
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # The 3x3 convolution is the one that carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.relu(self.bn2(self.conv2(main)))
        main = self.bn3(self.conv3(main))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        main += shortcut
        return self.relu(main)
100 | + | ||
101 | + | ||
class ResNet(nn.Module):
    """ResNet variant with a stride-1, 16-channel stem for single-channel input.

    Args:
        block: Residual block class; must expose ``expansion`` and accept
            ``(inplanes, planes, stride, downsample)``.
        layers: Number of blocks per stage. Entries 0-3 size ``layer1`` to
            ``layer4``; entry 4, when present, sizes ``layer5`` (otherwise
            entry 3 is reused, as before).
        num_classes: Number of outputs of the final linear layer.
        in_channels: Number of channels of the input.
    """

    def __init__(self, block, layers, num_classes=1000, in_channels=1):
        super(ResNet, self).__init__()
        self.inplanes = 16
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
                               bias=False)  # ori : stride = 2
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
        # Honour the fifth entry that the resnet18/resnet34 factories pass;
        # four-entry lists keep the earlier behaviour of reusing layers[3].
        # NOTE(review): layer5 is constructed but never called in forward().
        layer5_blocks = layers[4] if len(layers) > 4 else layers[3]
        self.layer5 = self._make_layer(block, 256, layer5_blocks, stride=2)
        # Global average pooling: identical to the former AvgPool2d(1) when
        # layer4 already yields a 1x1 map, and keeps fc's input width correct
        # for larger maps, which previously failed in fc.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(128 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``."""
        downsample = None
        # A projection shortcut is needed whenever the first block changes
        # the spatial size or the channel count.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run stem, max-pool, ``layer1``-``layer4``, pooling and the linear head."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
161 | + | ||
162 | + | ||
def _build_resnet(arch, block, layers, pretrained, **kwargs):
    """Create a ``ResNet`` and, if requested, load the weights listed for ``arch``."""
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls[arch]))
    return model


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, loads the weights from ``model_urls['resnet18']``
        **kwargs: Forwarded to ``ResNet``
    """
    return _build_resnet('resnet18', BasicBlock, [2, 2, 2, 2, 2], pretrained, **kwargs)


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, loads the weights from ``model_urls['resnet34']``
        **kwargs: Forwarded to ``ResNet``
    """
    return _build_resnet('resnet34', BasicBlock, [3, 4, 6, 3, 3], pretrained, **kwargs)


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, loads the weights from ``model_urls['resnet50']``
        **kwargs: Forwarded to ``ResNet``
    """
    return _build_resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, **kwargs)


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, loads the weights from ``model_urls['resnet101']``
        **kwargs: Forwarded to ``ResNet``
    """
    return _build_resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, **kwargs)


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, loads the weights from ``model_urls['resnet152']``
        **kwargs: Forwarded to ``ResNet``
    """
    return _build_resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, **kwargs)
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/train1.py
0 → 100644
This diff is collapsed. Click to expand it.
Speaker_Recognition/verification1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: If true, move the model to the GPU before loading weights.
        log_dir: Directory containing the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to restore.
        embedding_size: Forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``'state_dict'`` entry loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Remap to CPU when CUDA is not requested; otherwise a checkpoint that was
    # written from a GPU cannot be deserialised on a CPU-only run.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``'filename'`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename'].str

    # regex=False: the '.' must match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 | + | ||
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file in ``embedding_dir``.

    Entries without the ``.pth`` extension are skipped, so stray files in the
    directory are not handed to ``torch.load``.

    Args:
        embedding_dir: Directory holding one saved embedding per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object loaded from that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip a '.pth' occurring inside the speaker name.
        spk, ext = os.path.splitext(entry)
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, entry))

    return embeddings
49 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the feature file ``filename``.

    The features are cut into consecutive segments of at most ``test_frames``
    frames; the model's first output for each segment is summed and the sum
    is passed through ``l2_norm``.

    Args:
        use_cuda: If true, move every segment to the GPU before the forward pass.
        filename: Feature file read with ``read_MFB``.
        model: Callable returning ``(activation, _)`` for a segment tensor.
        test_frames: Maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    n_frames = len(features)
    if n_frames == 0:
        # Without this guard the integer 0 would reach l2_norm and fail there.
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every segment
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 | + | ||
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to unit L2 norm, then multiply by ``alpha``.

    A constant of 1e-10 is added to the squared norm before the square root
    so that an all-zero row does not divide by zero.
    """
    original_shape = input.size()  # size:(n_rows, dim)
    squared_norm = input.pow(2).sum(dim=1) + 1e-10  # size:(n_rows)
    row_norm = squared_norm.sqrt().view(-1, 1)
    unit_rows = input / row_norm.expand_as(input)
    return unit_rows.view(original_shape) * alpha
81 | + | ||
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score a test utterance against one enrolled speaker and print the decision.

    The decision is ``'Accept'`` when the cosine similarity between the test
    embedding and the enrolled embedding is strictly greater than ``thres``,
    otherwise ``'Reject'``.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to enrolled embedding.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Feature file of the test utterance; the name of its
            parent directory (up to the first ``'_'``) is printed as the
            claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: Decision threshold on the cosine similarity.

    Raises:
        KeyError: If ``enroll_speaker`` has no entry in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # .item() yields a Python float; %-formatting a one-element NumPy array
    # relies on an array-to-scalar conversion that NumPy has deprecated.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name, independent of the path separator in use.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 | + | ||
def main():
    """Verify one hard-coded test speaker against one hard-coded enrolled speaker."""

    log_dir = 'new_model1' # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
    # NOTE(review): hard-coded here although c.TEST_FEAT_DIR is used below -
    # confirm the two are meant to point at the same data.
    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved

    # Settings
    # Fall back to the CPU instead of crashing when no GPU is available.
    use_cuda = torch.cuda.is_available() # Use cuda or not
    embedding_size = 128 # Dimension of speaker embeddings
    # NOTE(review): enroll1.py creates the embeddings with cp_num = 24 and
    # test_frames = 200 - confirm that the different values here are intended.
    cp_num = 29 # Which checkpoint to use?
    n_classes = 241 # How many speakers in training data?
    test_frames = 100 # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further below)
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    # Set the true speaker
    enroll_speaker = 'zerothfloac'

    # Set the claimed speaker
    test_speaker = 'zerothfloac'

    # Threshold
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)


if __name__ == '__main__':
    main()
-
Please register or login to post a comment