Merge branch 'master' of http://khuhub.khu.ac.kr/2020-2-capstone-design1/LYG_project into master
Showing
16 changed files
with
798 additions
and
3 deletions
.idea/LYG_project.iml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<module type="JAVA_MODULE" version="4"> | ||
3 | + <component name="NewModuleRootManager" inherit-compiler-output="true"> | ||
4 | + <exclude-output /> | ||
5 | + <content url="file://$MODULE_DIR$" /> | ||
6 | + <orderEntry type="inheritedJdk" /> | ||
7 | + <orderEntry type="sourceFolder" forTests="false" /> | ||
8 | + </component> | ||
9 | +</module> | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
.idea/misc.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project version="4"> | ||
3 | + <component name="ProjectRootManager" version="2" languageLevel="JDK_14" project-jdk-name="14" project-jdk-type="JavaSDK"> | ||
4 | + <output url="file://$PROJECT_DIR$/out" /> | ||
5 | + </component> | ||
6 | +</project> | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
.idea/modules.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project version="4"> | ||
3 | + <component name="ProjectModuleManager"> | ||
4 | + <modules> | ||
5 | + <module fileurl="file://$PROJECT_DIR$/.idea/LYG_project.iml" filepath="$PROJECT_DIR$/.idea/LYG_project.iml" /> | ||
6 | + </modules> | ||
7 | + </component> | ||
8 | +</project> | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
.idea/vcs.xml
0 → 100644
.idea/workspace.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project version="4"> | ||
3 | + <component name="ChangeListManager"> | ||
4 | + <list default="true" id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" /> | ||
5 | + <option name="SHOW_DIALOG" value="false" /> | ||
6 | + <option name="HIGHLIGHT_CONFLICTS" value="true" /> | ||
7 | + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> | ||
8 | + <option name="LAST_RESOLUTION" value="IGNORE" /> | ||
9 | + </component> | ||
10 | + <component name="Git.Settings"> | ||
11 | + <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" /> | ||
12 | + </component> | ||
13 | + <component name="MavenImportPreferences"> | ||
14 | + <option name="generalSettings"> | ||
15 | + <MavenGeneralSettings> | ||
16 | + <option name="mavenHome" value="C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2020.2\plugins\maven\lib\maven3" /> | ||
17 | + </MavenGeneralSettings> | ||
18 | + </option> | ||
19 | + </component> | ||
20 | + <component name="ProjectId" id="1kPBM4RUtUJvLFOYYSRQMuBxazR" /> | ||
21 | + <component name="ProjectViewState"> | ||
22 | + <option name="hideEmptyMiddlePackages" value="true" /> | ||
23 | + <option name="showLibraryContents" value="true" /> | ||
24 | + </component> | ||
25 | + <component name="PropertiesComponent"> | ||
26 | + <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" /> | ||
27 | + <property name="last_opened_file_path" value="$PROJECT_DIR$/../../../../spring/Myprj" /> | ||
28 | + </component> | ||
29 | + <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" /> | ||
30 | + <component name="TaskManager"> | ||
31 | + <task active="true" id="Default" summary="Default task"> | ||
32 | + <changelist id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" /> | ||
33 | + <created>1605592296286</created> | ||
34 | + <option name="number" value="Default" /> | ||
35 | + <option name="presentableId" value="Default" /> | ||
36 | + <updated>1605592296286</updated> | ||
37 | + </task> | ||
38 | + <servers /> | ||
39 | + </component> | ||
40 | + <component name="WindowStateProjectService"> | ||
41 | + <state x="740" y="275" key="FileChooserDialogImpl" timestamp="1605592333707"> | ||
42 | + <screen x="0" y="0" width="1920" height="1040" /> | ||
43 | + </state> | ||
44 | + <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/-1920.65.1920.1040@0.0.1920.1040" timestamp="1605592333707" /> | ||
45 | + </component> | ||
46 | +</project> | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs' | ... | @@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs' |
5 | 5 | ||
6 | # Feature path | 6 | # Feature path |
7 | TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train' | 7 | TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train' |
8 | +# TRAIN_FEAT_DIR = '/test/merge_dataset' | ||
9 | +# TRAIN_FEAT_DIR = '/test/trainFeature' | ||
8 | TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' | 10 | TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' |
9 | 11 | ||
10 | # Context window size | 12 | # Context window size | ... | ... |
Speaker_Recognition/configure1_merge.py
0 → 100644
1 | +# Wave path | ||
2 | +TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train' | ||
3 | +DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev' | ||
4 | +TEST_WAV_DIR = 'test_wavs' | ||
5 | + | ||
6 | +# Feature path | ||
7 | +# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003' | ||
8 | +TRAIN_FEAT_DIR = '/test/merge_train_dataset' | ||
9 | +# TRAIN_FEAT_DIR = '/test/trainFeature' | ||
10 | +# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' | ||
11 | +TEST_FEAT_DIR = '/test/merge_test_dataset' | ||
12 | +# Context window size | ||
13 | +NUM_WIN_SIZE = 100 #10 | ||
14 | + | ||
15 | +# Settings for feature extraction | ||
16 | +USE_LOGSCALE = True | ||
17 | +USE_DELTA = False | ||
18 | +USE_SCALE = False | ||
19 | +SAMPLE_RATE = 16000 | ||
20 | +FILTER_BANK = 40 | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/configure1_zeroth.py
0 → 100644
1 | +# Wave path | ||
2 | +TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train' | ||
3 | +DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev' | ||
4 | +TEST_WAV_DIR = 'test_wavs' | ||
5 | + | ||
6 | +# Feature path | ||
7 | +# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003' | ||
8 | +TRAIN_FEAT_DIR = '/test/zeroth_train_dataset' | ||
9 | +# TRAIN_FEAT_DIR = '/test/trainFeature' | ||
10 | +# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test' | ||
11 | +TEST_FEAT_DIR = '/test/zeroth_test_dataset' | ||
12 | +# Context window size | ||
13 | +NUM_WIN_SIZE = 100 #10 | ||
14 | + | ||
15 | +# Settings for feature extraction | ||
16 | +USE_LOGSCALE = True | ||
17 | +USE_DELTA = False | ||
18 | +USE_SCALE = False | ||
19 | +SAMPLE_RATE = 16000 | ||
20 | +FILTER_BANK = 40 | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/enroll1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # original saved file with DataParallel | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # create new OrderedDict that does not contain `module.` | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
40 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
41 | + | ||
42 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
43 | + activation = 0 | ||
44 | + with torch.no_grad(): | ||
45 | + for i in range(tot_segments): | ||
46 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
47 | + | ||
48 | + TT = ToTensorTestInput() | ||
49 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
50 | + | ||
51 | + if use_cuda: | ||
52 | + temp_input = temp_input.cuda() | ||
53 | + temp_activation,_ = model(temp_input) | ||
54 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
55 | + | ||
56 | + activation = l2_norm(activation, 1) | ||
57 | + | ||
58 | + return activation | ||
59 | + | ||
60 | +def l2_norm(input, alpha): | ||
61 | + input_size = input.size() # size:(n_frames, dim) | ||
62 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
63 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
64 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
65 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
66 | + output = _output.view(input_size) | ||
67 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
68 | + output = output * alpha | ||
69 | + return output | ||
70 | + | ||
71 | +def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir): | ||
72 | + """ | ||
73 | + Output the averaged d-vector for each speaker (enrollment) | ||
74 | + Return the dictionary (length of n_spk) | ||
75 | + """ | ||
76 | + n_files = len(DB) # 10 | ||
77 | + enroll_speaker_list = sorted(set(DB['speaker_id'])) | ||
78 | + | ||
79 | + embeddings = {} | ||
80 | + | ||
81 | + # Aggregates all the activations | ||
82 | + print("Start to aggregate all the d-vectors per enroll speaker") | ||
83 | + | ||
84 | + for i in range(n_files): | ||
85 | + filename = DB['filename'][i] | ||
86 | + spk = DB['speaker_id'][i] | ||
87 | + | ||
88 | + activation = get_embeddings(use_cuda, filename, model, test_frames) | ||
89 | + if spk in embeddings: | ||
90 | + embeddings[spk] += activation | ||
91 | + else: | ||
92 | + embeddings[spk] = activation | ||
93 | + | ||
94 | + print("Aggregates the activation (spk : %s)" % (spk)) | ||
95 | + | ||
96 | + if not os.path.exists(embedding_dir): | ||
97 | + os.makedirs(embedding_dir) | ||
98 | + | ||
99 | + # Save the embeddings | ||
100 | + for spk_index in enroll_speaker_list: | ||
101 | + embedding_path = os.path.join(embedding_dir, spk_index+'.pth') | ||
102 | + torch.save(embeddings[spk_index], embedding_path) | ||
103 | + print("Save the embeddings for %s" % (spk_index)) | ||
104 | + return embeddings | ||
105 | + | ||
106 | +def main(): | ||
107 | + | ||
108 | + # Settings | ||
109 | + use_cuda = True | ||
110 | + log_dir = 'new_model1' | ||
111 | + embedding_size = 128 | ||
112 | + cp_num = 24 # Which checkpoint to use? | ||
113 | + n_classes = 241 | ||
114 | + test_frames = 200 | ||
115 | + | ||
116 | + # Load model from checkpoint | ||
117 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
118 | + | ||
119 | + # Get the dataframe for enroll DB | ||
120 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
121 | + | ||
122 | + # Where to save embeddings | ||
123 | + embedding_dir = 'enroll_embeddings1' | ||
124 | + | ||
125 | + # Perform the enrollment and save the results | ||
126 | + enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir) | ||
127 | + | ||
128 | + """ Test speaker list | ||
129 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
130 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
131 | + """ | ||
132 | + | ||
133 | +if __name__ == '__main__': | ||
134 | + main() | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -123,10 +123,10 @@ def main(): | ... | @@ -123,10 +123,10 @@ def main(): |
123 | """ | 123 | """ |
124 | 124 | ||
125 | spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\ | 125 | spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\ |
126 | - '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'] | 126 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777'] |
127 | 127 | ||
128 | # Set the test speaker | 128 | # Set the test speaker |
129 | - test_speaker = '230M4087' | 129 | + test_speaker = '778M8777' |
130 | 130 | ||
131 | test_path = os.path.join(test_dir, test_speaker, 'test.p') | 131 | test_path = os.path.join(test_dir, test_speaker, 'test.p') |
132 | 132 | ||
... | @@ -134,4 +134,4 @@ def main(): | ... | @@ -134,4 +134,4 @@ def main(): |
134 | best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list) | 134 | best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list) |
135 | 135 | ||
136 | if __name__ == '__main__': | 136 | if __name__ == '__main__': |
137 | - main() | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
137 | + main() | ... | ... |
Speaker_Recognition/identification1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # original saved file with DataParallel | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # create new OrderedDict that does not contain `module.` | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list): | ||
83 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
84 | + max_score = -10**8 | ||
85 | + best_spk = None | ||
86 | + for spk in spk_list: | ||
87 | + score = F.cosine_similarity(test_embedding, embeddings[spk]) | ||
88 | + score = score.data.cpu().numpy() | ||
89 | + if score > max_score: | ||
90 | + max_score = score | ||
91 | + best_spk = spk | ||
92 | + #print("Speaker identification result : %s" %best_spk) | ||
93 | + true_spk = test_filename.split('/')[-2].split('_')[0] | ||
94 | + print("\n=== Speaker identification ===") | ||
95 | + print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk)) | ||
96 | + return best_spk | ||
97 | + | ||
98 | +def main(): | ||
99 | + | ||
100 | + log_dir = 'new_model1' # Where the checkpoints are saved | ||
101 | + embedding_dir = 'enroll_embeddings1' # Where embeddings are saved | ||
102 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
103 | + | ||
104 | + # Settings | ||
105 | + use_cuda = True # Use cuda or not | ||
106 | + embedding_size = 128 # Dimension of speaker embeddings | ||
107 | + cp_num = 30 # Which checkpoint to use? | ||
108 | + n_classes = 241 # How many speakers in training data? | ||
109 | + test_frames = 100 # Split the test utterance | ||
110 | + | ||
111 | + # Load model from checkpoint | ||
112 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
113 | + | ||
114 | + # Get the dataframe for test DB | ||
115 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
116 | + | ||
117 | + # Load enroll embeddings | ||
118 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
119 | + | ||
120 | + """ Test speaker list | ||
121 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
122 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
123 | + """ | ||
124 | + | ||
125 | + spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\ | ||
126 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777'] | ||
127 | + | ||
128 | + # Set the test speaker | ||
129 | + test_speaker = '213F5100' | ||
130 | + | ||
131 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
132 | + | ||
133 | + # Perform the test | ||
134 | + best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list) | ||
135 | + | ||
136 | +if __name__ == '__main__': | ||
137 | + main() |
Speaker_Recognition/model/model1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn as nn | ||
3 | +import torch.nn.functional as F | ||
4 | +from torch.autograd import Function | ||
5 | +import model.resnet as resnet | ||
6 | + | ||
7 | + | ||
8 | +class background_resnet(nn.Module): | ||
9 | + def __init__(self, embedding_size, num_classes, backbone='resnet18'): | ||
10 | + super(background_resnet, self).__init__() | ||
11 | + self.backbone = backbone | ||
12 | + # copying modules from pretrained models | ||
13 | + if backbone == 'resnet50': | ||
14 | + self.pretrained = resnet.resnet50(pretrained=False) | ||
15 | + elif backbone == 'resnet101': | ||
16 | + self.pretrained = resnet.resnet101(pretrained=False) | ||
17 | + elif backbone == 'resnet152': | ||
18 | + self.pretrained = resnet.resnet152(pretrained=False) | ||
19 | + elif backbone == 'resnet18': | ||
20 | + self.pretrained = resnet.resnet18(pretrained=False) | ||
21 | + elif backbone == 'resnet34': | ||
22 | + self.pretrained = resnet.resnet34(pretrained=False) | ||
23 | + else: | ||
24 | + raise RuntimeError('unknown backbone: {}'.format(backbone)) | ||
25 | + | ||
26 | + self.fc0 = nn.Linear(128, embedding_size) | ||
27 | + self.bn0 = nn.BatchNorm1d(embedding_size) | ||
28 | + self.relu = nn.ReLU() | ||
29 | + self.last = nn.Linear(embedding_size, num_classes) | ||
30 | + | ||
31 | + def forward(self, x): | ||
32 | + # input x: minibatch x 1 x 40 x 40 | ||
33 | + x = self.pretrained.conv1(x) | ||
34 | + x = self.pretrained.bn1(x) | ||
35 | + x = self.pretrained.relu(x) | ||
36 | + | ||
37 | + x = self.pretrained.layer1(x) | ||
38 | + x = self.pretrained.layer2(x) | ||
39 | + x = self.pretrained.layer3(x) | ||
40 | + x = self.pretrained.layer4(x) | ||
41 | + | ||
42 | + out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1] | ||
43 | + out = torch.squeeze(out) # [batch, n_embed] | ||
44 | + # flatten the out so that the fully connected layer can be connected from here | ||
45 | + out = out.view(x.size(0), -1) # (n_batch, n_embed) | ||
46 | + spk_embedding = self.fc0(out) | ||
47 | + out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed] | ||
48 | + out = self.last(out) | ||
49 | + | ||
50 | + return spk_embedding, out | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -113,6 +113,7 @@ class ResNet(nn.Module): | ... | @@ -113,6 +113,7 @@ class ResNet(nn.Module): |
113 | self.layer2 = self._make_layer(block, 32, layers[1], stride=2) | 113 | self.layer2 = self._make_layer(block, 32, layers[1], stride=2) |
114 | self.layer3 = self._make_layer(block, 64, layers[2], stride=2) | 114 | self.layer3 = self._make_layer(block, 64, layers[2], stride=2) |
115 | self.layer4 = self._make_layer(block, 128, layers[3], stride=2) | 115 | self.layer4 = self._make_layer(block, 128, layers[3], stride=2) |
116 | + | ||
116 | self.avgpool = nn.AvgPool2d(1, stride=1) | 117 | self.avgpool = nn.AvgPool2d(1, stride=1) |
117 | self.fc = nn.Linear(128 * block.expansion, num_classes) | 118 | self.fc = nn.Linear(128 * block.expansion, num_classes) |
118 | 119 | ... | ... |
Speaker_Recognition/model/resnet1.py
0 → 100644
1 | +"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py | ||
2 | +and added support for the 1x32x32 mel spectrogram for the speech recognition. | ||
3 | +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition | ||
4 | +https://arxiv.org/abs/1512.03385 | ||
5 | +""" | ||
6 | + | ||
7 | +import torch.nn as nn | ||
8 | +import math | ||
9 | +import torch.utils.model_zoo as model_zoo | ||
10 | + | ||
11 | + | ||
12 | +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', | ||
13 | + 'resnet152'] | ||
14 | + | ||
15 | + | ||
16 | +model_urls = { | ||
17 | + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', | ||
18 | + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', | ||
19 | + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', | ||
20 | + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', | ||
21 | + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', | ||
22 | +} | ||
23 | + | ||
24 | + | ||
25 | +def conv3x3(in_planes, out_planes, stride=1): | ||
26 | + """3x3 convolution with padding""" | ||
27 | + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, | ||
28 | + padding=1, bias=False) | ||
29 | + | ||
30 | + | ||
31 | +class BasicBlock(nn.Module): | ||
32 | + expansion = 1 | ||
33 | + | ||
34 | + def __init__(self, inplanes, planes, stride=1, downsample=None): | ||
35 | + super(BasicBlock, self).__init__() | ||
36 | + self.conv1 = conv3x3(inplanes, planes, stride) | ||
37 | + self.bn1 = nn.BatchNorm2d(planes) | ||
38 | + self.relu = nn.ReLU(inplace=True) | ||
39 | + self.conv2 = conv3x3(planes, planes) | ||
40 | + self.bn2 = nn.BatchNorm2d(planes) | ||
41 | + self.downsample = downsample | ||
42 | + self.stride = stride | ||
43 | + | ||
44 | + def forward(self, x): | ||
45 | + residual = x | ||
46 | + | ||
47 | + out = self.conv1(x) | ||
48 | + out = self.bn1(out) | ||
49 | + out = self.relu(out) | ||
50 | + | ||
51 | + out = self.conv2(out) | ||
52 | + out = self.bn2(out) | ||
53 | + | ||
54 | + if self.downsample is not None: | ||
55 | + residual = self.downsample(x) | ||
56 | + | ||
57 | + out += residual | ||
58 | + out = self.relu(out) | ||
59 | + | ||
60 | + return out | ||
61 | + | ||
62 | + | ||
63 | +class Bottleneck(nn.Module): | ||
64 | + expansion = 4 | ||
65 | + | ||
66 | + def __init__(self, inplanes, planes, stride=1, downsample=None): | ||
67 | + super(Bottleneck, self).__init__() | ||
68 | + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) | ||
69 | + self.bn1 = nn.BatchNorm2d(planes) | ||
70 | + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, | ||
71 | + padding=1, bias=False) | ||
72 | + self.bn2 = nn.BatchNorm2d(planes) | ||
73 | + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) | ||
74 | + self.bn3 = nn.BatchNorm2d(planes * 4) | ||
75 | + self.relu = nn.ReLU(inplace=True) | ||
76 | + self.downsample = downsample | ||
77 | + self.stride = stride | ||
78 | + | ||
79 | + def forward(self, x): | ||
80 | + residual = x | ||
81 | + | ||
82 | + out = self.conv1(x) | ||
83 | + out = self.bn1(out) | ||
84 | + out = self.relu(out) | ||
85 | + | ||
86 | + out = self.conv2(out) | ||
87 | + out = self.bn2(out) | ||
88 | + out = self.relu(out) | ||
89 | + | ||
90 | + out = self.conv3(out) | ||
91 | + out = self.bn3(out) | ||
92 | + | ||
93 | + if self.downsample is not None: | ||
94 | + residual = self.downsample(x) | ||
95 | + | ||
96 | + out += residual | ||
97 | + out = self.relu(out) | ||
98 | + | ||
99 | + return out | ||
100 | + | ||
101 | + | ||
102 | +class ResNet(nn.Module): | ||
103 | + | ||
104 | + def __init__(self, block, layers, num_classes=1000, in_channels=1): | ||
105 | + self.inplanes = 16 | ||
106 | + super(ResNet, self).__init__() | ||
107 | + self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3, | ||
108 | + bias=False) # ori : stride = 2 | ||
109 | + self.bn1 = nn.BatchNorm2d(16) | ||
110 | + self.relu = nn.ReLU(inplace=True) | ||
111 | + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | ||
112 | + self.layer1 = self._make_layer(block, 16, layers[0]) | ||
113 | + self.layer2 = self._make_layer(block, 32, layers[1], stride=2) | ||
114 | + self.layer3 = self._make_layer(block, 64, layers[2], stride=2) | ||
115 | + self.layer4 = self._make_layer(block, 128, layers[3], stride=2) | ||
116 | + self.layer5 = self._make_layer(block, 256, layers[3], stride=2) | ||
117 | + self.avgpool = nn.AvgPool2d(1, stride=1) | ||
118 | + self.fc = nn.Linear(128 * block.expansion, num_classes) | ||
119 | + | ||
120 | + for m in self.modules(): | ||
121 | + if isinstance(m, nn.Conv2d): | ||
122 | + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | ||
123 | + m.weight.data.normal_(0, math.sqrt(2. / n)) | ||
124 | + elif isinstance(m, nn.BatchNorm2d): | ||
125 | + m.weight.data.fill_(1) | ||
126 | + m.bias.data.zero_() | ||
127 | + | ||
128 | + def _make_layer(self, block, planes, blocks, stride=1): | ||
129 | + downsample = None | ||
130 | + if stride != 1 or self.inplanes != planes * block.expansion: | ||
131 | + downsample = nn.Sequential( | ||
132 | + nn.Conv2d(self.inplanes, planes * block.expansion, | ||
133 | + kernel_size=1, stride=stride, bias=False), | ||
134 | + nn.BatchNorm2d(planes * block.expansion), | ||
135 | + ) | ||
136 | + | ||
137 | + layers = [] | ||
138 | + layers.append(block(self.inplanes, planes, stride, downsample)) | ||
139 | + self.inplanes = planes * block.expansion | ||
140 | + for i in range(1, blocks): | ||
141 | + layers.append(block(self.inplanes, planes)) | ||
142 | + | ||
143 | + return nn.Sequential(*layers) | ||
144 | + | ||
145 | + def forward(self, x): | ||
146 | + x = self.conv1(x) | ||
147 | + x = self.bn1(x) | ||
148 | + x = self.relu(x) | ||
149 | + x = self.maxpool(x) | ||
150 | + | ||
151 | + x = self.layer1(x) | ||
152 | + x = self.layer2(x) | ||
153 | + x = self.layer3(x) | ||
154 | + x = self.layer4(x) | ||
155 | + | ||
156 | + x = self.avgpool(x) | ||
157 | + x = x.view(x.size(0), -1) | ||
158 | + x = self.fc(x) | ||
159 | + | ||
160 | + return x | ||
161 | + | ||
162 | + | ||
163 | +def resnet18(pretrained=False, **kwargs): | ||
164 | + """Constructs a ResNet-18 model. | ||
165 | + Args: | ||
166 | + pretrained (bool): If True, returns a model pre-trained on ImageNet | ||
167 | + """ | ||
168 | + model = ResNet(BasicBlock, [2, 2, 2, 2, 2], **kwargs) | ||
169 | + if pretrained: | ||
170 | + model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) | ||
171 | + return model | ||
172 | + | ||
173 | + | ||
174 | +def resnet34(pretrained=False, **kwargs): | ||
175 | + """Constructs a ResNet-34 model. | ||
176 | + Args: | ||
177 | + pretrained (bool): If True, returns a model pre-trained on ImageNet | ||
178 | + """ | ||
179 | + model = ResNet(BasicBlock, [3, 4, 6, 3, 3], **kwargs) | ||
180 | + if pretrained: | ||
181 | + model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) | ||
182 | + return model | ||
183 | + | ||
184 | + | ||
185 | +def resnet50(pretrained=False, **kwargs): | ||
186 | + """Constructs a ResNet-50 model. | ||
187 | + Args: | ||
188 | + pretrained (bool): If True, returns a model pre-trained on ImageNet | ||
189 | + """ | ||
190 | + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) | ||
191 | + if pretrained: | ||
192 | + model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) | ||
193 | + return model | ||
194 | + | ||
195 | + | ||
196 | +def resnet101(pretrained=False, **kwargs): | ||
197 | + """Constructs a ResNet-101 model. | ||
198 | + Args: | ||
199 | + pretrained (bool): If True, returns a model pre-trained on ImageNet | ||
200 | + """ | ||
201 | + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) | ||
202 | + if pretrained: | ||
203 | + model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) | ||
204 | + return model | ||
205 | + | ||
206 | + | ||
207 | +def resnet152(pretrained=False, **kwargs): | ||
208 | + """Constructs a ResNet-152 model. | ||
209 | + Args: | ||
210 | + pretrained (bool): If True, returns a model pre-trained on ImageNet | ||
211 | + """ | ||
212 | + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) | ||
213 | + if pretrained: | ||
214 | + model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) | ||
215 | + return model | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
Speaker_Recognition/train1.py
0 → 100644
This diff is collapsed. Click to expand it.
Speaker_Recognition/verification1.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model1 import background_resnet | ||
13 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda (bool): Move the model to the GPU before loading weights.
        log_dir (str): Directory that contains the checkpoint files.
        cp_num (int): Checkpoint number; the file read is
            ``<log_dir>/checkpoint_<cp_num>.pth``.
        embedding_size (int): Forwarded to ``background_resnet``.
        n_classes (int): Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')

    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location a checkpoint saved from the GPU cannot be
    # deserialised on a CPU-only run.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    state_dict = checkpoint['state_dict']

    # A model saved while wrapped in DataParallel has every key prefixed with
    # `module.`; strip it so the keys match the bare model built above.
    # Only done when *all* keys carry the prefix, so plain checkpoints are
    # passed through untouched.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}

    model.load_state_dict(state_dict)
    model.eval()
    return model
25 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing into enrollment and test subsets.

    The listing returned by ``read_feats_structure(dataroot_dir)`` is indexed
    by its ``'filename'`` column: rows whose filename contains ``enroll.p``
    go to the enrollment set, rows containing ``test.p`` go to the test set.

    Args:
        dataroot_dir: Passed unchanged to ``read_feats_structure``.

    Returns:
        tuple: ``(enroll_DB, test_DB)``, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename'].str

    # regex=False: match the literal file names. With the default regex
    # matching, the '.' would match any character (e.g. 'test/p...').
    enroll_DB = DB_all[filenames.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 | + | ||
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every ``<speaker>.pth`` file found in ``embedding_dir``.

    Args:
        embedding_dir (str): Directory holding one ``.pth`` file per speaker.
        map_location: Forwarded to ``torch.load``; pass ``'cpu'`` to load
            embeddings that were saved from a GPU on a CPU-only host.
            Defaults to ``None`` (torch's default placement).

    Returns:
        dict: Maps the file name without its ``.pth`` extension to the
        object stored in that file.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        spk, ext = os.path.splitext(entry)
        # Skip stray files (notes, OS metadata, ...) instead of handing
        # them to torch.load, which would raise on non-checkpoint content.
        if ext != '.pth':
            continue
        embedding_path = os.path.join(embedding_dir, entry)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
49 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features read by ``read_MFB`` are cut into consecutive chunks of
    ``test_frames`` frames (the last chunk may be shorter). Each chunk is
    passed through ``model``, the first element of each model output is
    summed over chunks, and the sum is L2-normalised.

    Args:
        use_cuda (bool): Move each chunk to the GPU before the forward pass.
        filename: Feature file passed to ``read_MFB``.
        model: Callable returning a pair whose first element is the
            activation tensor.
        test_frames (int): Number of frames per chunk.

    Returns:
        The normalised, summed activation produced by ``l2_norm(..., 1)``.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # per the original note: (n_frames, n_dims)

    n_frames = len(features)
    if n_frames == 0:
        # Otherwise the loop never runs and `activation` stays the int 0,
        # which fails inside l2_norm with an unrelated AttributeError.
        raise ValueError('No feature frames found in %s' % filename)

    tot_segments = math.ceil(n_frames / test_frames)  # number of chunks
    to_tensor = ToTensorTestInput()  # loop-invariant, build it once
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            segment = features[start:start + test_frames]
            segment = to_tensor(segment)  # per the original note: (1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 | + | ||
def l2_norm(input, alpha):
    """Scale each row of ``input`` to unit L2 length, then multiply by ``alpha``.

    Args:
        input: 2-D tensor; rows are normalised independently along dim 1.
        alpha: Scalar applied to the normalised rows (the scaling idea is
            taken from https://arxiv.org/pdf/1703.09507.pdf).

    Returns:
        Tensor with the same size as ``input``.
    """
    original_shape = input.size()
    # 1e-10 keeps the square root away from zero for all-zero rows.
    squared_sum = input.pow(2).sum(1) + 1e-10
    row_norm = squared_sum.sqrt().view(-1, 1)
    unit_rows = (input / row_norm.expand_as(input)).view(original_shape)
    return unit_rows * alpha
81 | + | ||
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test file against an enrolled speaker and print the verdict.

    Args:
        use_cuda (bool): Forwarded to ``get_embeddings``.
        model: Network forwarded to ``get_embeddings``.
        embeddings (dict): Enrolled embeddings keyed by speaker name.
        enroll_speaker (str): Key of the enrolled embedding to compare with.
        test_filename (str): Path of the test feature file; the name of its
            parent directory (up to the first ``_``) is printed as the
            speaker of the test file.
        test_frames (int): Forwarded to ``get_embeddings``.
        thres (float): The result is 'Accept' when the cosine similarity is
            strictly greater than this value, otherwise 'Reject'.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The enrolled embedding may have been saved from another device;
    # cosine_similarity needs both operands on the same one.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    score = F.cosine_similarity(test_embedding, enroll_embedding)
    # Convert to a plain Python float: comparing / %-formatting a
    # one-element ndarray is deprecated since NumPy 1.25.
    score = score.item()

    result = 'Accept' if score > thres else 'Reject'

    # Parent directory name, independent of the OS path separator.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 | + | ||
def main():
    """Run a single speaker-verification trial with the settings below.

    Loads the checkpointed model and the enrolled embeddings, then scores
    ``<test_dir>/<test_speaker>/test.p`` against ``enroll_speaker`` and
    prints the result.
    """
    log_dir = 'new_model1'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    # Use the GPU when one is present; a hard-coded True crashes on
    # CPU-only hosts at the first .cuda() call.
    use_cuda = torch.cuda.is_available()
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 29  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): neither result is used below; the call is kept so any
    # side effects of reading the feature listing are unchanged.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    # Set the true speaker
    enroll_speaker = 'zerothfloac'

    # Set the claimed speaker
    test_speaker = 'zerothfloac'

    # Threshold
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
-
Please register or login to post a comment