장재혁

add server

from flask import Flask, request, send_file
from extract_feature4 import extract
from verification4_merge import load_model, load_enroll_embeddings,perform_verification
from identification4 import perform_identification
from enroll4_merge import split_enroll_and_test,enroll_per_spk
import os
import shutil
# Flask application object; the route handlers in this file register on it.
app = Flask(__name__)

# --- Locations ---
log_dir = '../new_model4_merge'  # Where the checkpoints are saved
embedding_dir = '../enroll_embeddings4_merge'  # Where embeddings are saved
test_dir = '../feat_logfbank_nfilt40/test/'  # Where test features are saved

# --- Settings ---
use_cuda = True  # Use cuda or not
embedding_size = 128  # Dimension of speaker embeddings
cp_num = 50  # Which checkpoint to use?
n_classes = 348  # How many speakers in training data?
test_frames = 100  # Split the test utterance

# Loaded once at import time and shared by the request handlers.
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
embeddings = load_enroll_embeddings(embedding_dir)

# Feature pickle read by verification() and identification().
test_path = './test.p'

# Candidate speakers for identification; enroll_controller appends to it.
spk_list = [
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
]
def enrollment():
    """Re-run enrollment over ``test_dir`` and refresh the global embeddings.

    Splits the feature directory into enroll/test entries, computes and
    saves the per-speaker embeddings into ``embedding_dir``, then reloads
    them into the module-level ``embeddings`` dict.

    Raises:
        Exception: any error from the enrollment helpers is printed and then
            re-raised, so the caller can tell that enrollment failed instead
            of silently continuing with stale embeddings.
    """
    global embeddings
    try:
        # Only the enroll split is needed here; the test split is ignored.
        enroll_DB, _ = split_enroll_and_test(test_dir)
        enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
        embeddings = load_enroll_embeddings(embedding_dir)
    except Exception as e:
        print(e)
        raise
def verification(enroll_speaker, thres=0.95):
    """Verify the utterance stored at ``test_path`` against an enrolled speaker.

    Args:
        enroll_speaker: key of the enrolled speaker in the global
            ``embeddings`` dict.
        thres: decision threshold forwarded to ``perform_verification``
            (defaults to the previously hard-coded 0.95).

    Returns:
        Whatever ``perform_verification`` returns for the module-level model,
        embeddings, test path and frame settings.
    """
    # Perform the test
    return perform_verification(
        use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres
    )
def identification():
    """Identify the speaker of the utterance stored at ``test_path``.

    Delegates to ``perform_identification`` with the module-level model,
    embeddings, frame settings and ``spk_list``, and returns its result.
    """
    return perform_identification(
        use_cuda, model, embeddings, test_path, test_frames, spk_list
    )
@app.route('/enroll', methods=['POST', "GET"])
def enroll_controller():
    """Enroll a speaker from an uploaded recording.

    A POST must carry a ``file`` upload and an ``enroll_speaker`` form field.
    The upload is saved, features are extracted, the resulting ``enroll.p``
    is moved into the speaker's directory under ``test_dir`` and enrollment
    is re-run.

    Returns:
        ``'enroll_complete'`` on success, ``'failed'`` when ``enrollment()``
        raises, ``('failed', 400)`` for an unusable speaker name, and
        ``'get'`` for GET requests.
    """
    if request.method != 'POST':
        return 'get'

    f = request.files['file']
    enroll_speaker = request.form['enroll_speaker']
    print(f.name)

    # The speaker name becomes a directory name below, so reject anything
    # that could point outside the feature directory.
    if (not enroll_speaker
            or enroll_speaker in ('.', '..')
            or '\\' in enroll_speaker
            or os.path.basename(enroll_speaker) != enroll_speaker):
        return 'failed', 400

    f.save('./myrequest_enroll.wav')
    extract('./myrequest_enroll.wav', enroll_speaker)

    new_path = os.path.join(test_dir, enroll_speaker)
    # exist_ok lets an already-enrolled speaker be enrolled again.
    os.makedirs(new_path, exist_ok=True)
    shutil.move('./enroll.p', os.path.join(new_path, 'enroll.p'))

    try:
        enrollment()
    except Exception as e:
        print(e)
        return 'failed'

    # Keep the candidate list free of duplicates on re-enrollment.
    if enroll_speaker not in spk_list:
        spk_list.append(enroll_speaker)
    return 'enroll_complete'
@app.route('/verification', methods=['POST', "GET"])
def verfication_controller():
    """Score an uploaded recording against an enrolled speaker.

    A POST must carry a ``file`` upload and an ``enroll_speaker`` form field.

    Returns:
        The score produced by ``verification`` on success,
        ``('unknown speaker', 404)`` when the speaker has no entry in the
        global ``embeddings`` dict, and ``'get'`` for GET requests.
    """
    if request.method != 'POST':
        return 'get'

    f = request.files['file']
    enroll_speaker = request.form['enroll_speaker']
    print(f.name)

    # Looking up a speaker that was never enrolled would raise KeyError
    # further down and surface as a 500; answer explicitly instead.
    if enroll_speaker not in embeddings:
        return 'unknown speaker', 404

    f.save('./myrequest.wav')
    extract('./myrequest.wav')
    _, score = verification(enroll_speaker)
    return score
@app.route('/identification', methods=['POST', "GET"])
def identification_controller():
    """Identify the speaker of an uploaded recording.

    On POST the ``file`` upload is saved, features are extracted from it and
    the result of ``identification()`` is returned. GET returns ``'get'``.
    """
    if request.method != 'POST':
        return 'get'

    upload = request.files['file']
    print(upload.name)

    wav_path = './myrequest.wav'
    upload.save(wav_path)
    extract(wav_path)
    return identification()
@app.route('/debugger', methods=['GET'])
def debugger():
    """Deliberately raise an error, but only while the app runs in debug mode.

    NOTE(review): this route previously returned an undefined name, which
    raised NameError on every request. That looks like an intentional way to
    reach the error/debugger page - confirm the intent. The failure is now
    explicit, and outside debug mode the route answers 404.
    """
    if not app.debug:
        return 'not found', 404
    raise RuntimeError('intentional error raised by /debugger')
@app.route('/robots.txt', methods=['GET'])
def antirobot():
    """Serve the ``robots.txt`` file."""
    robots_file = 'robots.txt'
    return send_file(robots_file)
if __name__ == '__main__':
    # The server listens on every interface. With the interactive debugger
    # enabled, anyone who can reach the port could execute code, so debug
    # mode is opt-in via FLASK_DEBUG=1 rather than always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(host='0.0.0.0', port=7777, debug=debug_enabled)
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: move the model to the GPU when true.
        log_dir: directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location, a checkpoint holding GPU tensors cannot be
    # restored on a CPU-only run; remap to CPU when CUDA is not requested.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    # The state dict is loaded as stored: no `module.` prefix is stripped
    # here, so its keys must already match the model.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: match the names literally; as a regex the '.' would
    # match any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    # Reset the index
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the features stored in ``filename``.

    The features are cut into consecutive chunks of ``test_frames`` frames.
    Each chunk goes through the model, the first element of each model
    output is summed over dim 0, and the total is normalised with ``l2_norm``.

    Args:
        use_cuda: move each chunk to the GPU before the forward pass.
        filename: feature file read with ``read_MFB``.
        model: network returning a 2-tuple; only the first element is used.
        test_frames: number of frames per chunk.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    feats, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    tot_segments = math.ceil(len(feats) / test_frames)  # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail obscurely.
        raise ValueError('no frames found in %s' % filename)

    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = feats[start:start + test_frames]
            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)
    return activation
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to unit L2 length, then multiply by ``alpha``.

    Args:
        input: tensor whose rows (dim 1) are normalised.
        alpha: factor applied after normalisation. The paper at
            https://arxiv.org/pdf/1703.09507.pdf suggests 10; the caller in
            this file passes 1.

    Returns:
        A tensor with the same size as ``input``.
    """
    # 1e-10 keeps the square root away from zero for all-zero rows.
    squared_sum = input.pow(2).sum(1) + 1e-10
    row_norms = squared_sum.sqrt().view(-1, 1)
    normalised = (input / row_norms.expand_as(input)).view(input.size())
    return normalised * alpha
def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Accumulate one d-vector per enrollment speaker and save it to disk.

    The embeddings of all files belonging to a speaker are summed (they are
    not divided by the file count). Each speaker's total is written to
    ``<embedding_dir>/<speaker_id>.pth``.

    Args:
        use_cuda, test_frames, model: forwarded to ``get_embeddings``.
        DB: table with ``filename`` and ``speaker_id`` columns.
        embedding_dir: output directory, created if missing.

    Returns:
        Dict mapping speaker id to its summed embedding.
    """
    embeddings = {}

    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")
    # Walk the two columns positionally so the loop does not depend on DB
    # carrying a 0..n-1 index.
    for filename, spk in zip(DB['filename'], DB['speaker_id']):
        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation
        print("Aggregates the activation (spk : %s)" % (spk))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(embedding_dir, exist_ok=True)

    # Save the embeddings
    for spk_index in sorted(embeddings):
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings
def main():
    """Enroll every speaker listed under ``c.TEST_FEAT_DIR`` and save the embeddings."""
    # Where the checkpoint lives and where to save embeddings
    log_dir = 'new_model4_merge'
    embedding_dir = 'enroll_embeddings4_merge'

    # Settings
    use_cuda = True
    embedding_size = 128
    cp_num = 50  # Which checkpoint to use?
    n_classes = 348
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for enroll DB (the test split is not used here)
    enroll_DB = split_enroll_and_test(c.TEST_FEAT_DIR)[0]

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'


if __name__ == '__main__':
    main()
import librosa
import numpy as np
from python_speech_features import fbank
import pickle
sample_rate=16000
#filename='./sunghwan/8sec2.wav'
def normalize_frames(m, Scale=True):
    """Subtract the per-column mean of ``m``; optionally divide by the per-column std.

    Args:
        m: array normalised along axis 0.
        Scale: when true, also divide by ``std + 2e-12`` (the small constant
            avoids division by zero).

    Returns:
        The normalised array.
    """
    centered = m - np.mean(m, axis=0)
    if not Scale:
        return centered
    return centered / (np.std(m, axis=0) + 2e-12)
def extract(filename, savename='test.p'):
    """Extract mean-normalised log filterbank features and pickle them.

    The audio is loaded at ``sample_rate`` in mono and passed to ``fbank``
    with 40 filters. The result is converted with ``20 * log10`` (floored at
    1e-5) and mean-normalised without scaling.

    Args:
        filename: audio file to read.
        savename: path of the pickle to write. It stores
            ``{'feat': ..., 'label': ...}``, where the label is the file
            name of ``savename`` up to its first dot.
    """
    import os  # local import: this module's own imports do not include os

    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    filter_banks, _ = fbank(audio, samplerate=sample_rate, nfilt=40, winlen=0.025)
    filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    feature = normalize_frames(filter_banks, Scale=False)
    # Use only the file-name part: splitting the whole path on '.' gives an
    # empty label for paths such as './test.p'.
    label = os.path.basename(savename).split('.')[0]
    todump = {'feat': feature, 'label': label}
    with open(savename, 'wb') as f:
        pickle.dump(todump, f)
import librosa
import numpy as np
from python_speech_features import fbank
import pickle
sample_rate = 16000
#filename='./sunghwan/8sec2.wav'
def normalize_frames(m, Scale=True):
    """Remove the per-column mean of ``m`` and, if ``Scale``, divide by the per-column std.

    Args:
        m: array normalised along axis 0.
        Scale: when true, the centred values are divided by ``std + 2e-12``.

    Returns:
        The normalised array.
    """
    column_mean = np.mean(m, axis=0)
    if Scale:
        column_std = np.std(m, axis=0)
        return (m - column_mean) / (column_std + 2e-12)
    return m - column_mean
def extract(filename, label='test.p'):
    """Extract mean-normalised log filterbank features and pickle them.

    Args:
        filename: audio file to read (loaded at ``sample_rate``, mono).
        label: stored as ``'label'`` in the pickle. With the default value
            the output goes to ``test.p``; any other label writes to
            ``enroll.p``.
    """
    signal, _ = librosa.load(filename, sr=sample_rate, mono=True)
    fbank_energies, _ = fbank(signal, samplerate=sample_rate, nfilt=40, winlen=0.025)
    log_fbank = 20 * np.log10(np.maximum(fbank_energies, 1e-5))
    feature = normalize_frames(log_fbank, Scale=False)

    # The default label doubles as the marker for "test" output.
    savename = 'test.p' if label == 'test.p' else 'enroll.p'
    with open(savename, 'wb') as f:
        pickle.dump({'feat': feature, 'label': label}, f)
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: move the model to the GPU when true.
        log_dir: directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location, a checkpoint holding GPU tensors cannot be
    # restored on a CPU-only run; remap to CPU when CUDA is not requested.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    # The state dict is loaded as stored: no `module.` prefix is stripped
    # here, so its keys must already match the model.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: match the names literally; as a regex the '.' would
    # match any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    # Reset the index
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every ``<speaker>.pth`` file in ``embedding_dir``.

    Args:
        embedding_dir: directory holding the saved embeddings.
        map_location: forwarded to ``torch.load``; pass ``'cpu'`` to load
            GPU-saved embeddings on a machine without CUDA. The default
            keeps ``torch.load``'s own behaviour.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        loaded object.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        if ext != '.pth':
            # Stray files (editor backups, OS metadata, ...) are not
            # embeddings and would make torch.load fail.
            continue
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the features stored in ``filename``.

    The features are cut into consecutive chunks of ``test_frames`` frames.
    Each chunk goes through the model, the first element of each model
    output is summed over dim 0, and the total is normalised with ``l2_norm``.

    Args:
        use_cuda: move each chunk to the GPU before the forward pass.
        filename: feature file read with ``read_MFB``.
        model: network returning a 2-tuple; only the first element is used.
        test_frames: number of frames per chunk.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    feats, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    tot_segments = math.ceil(len(feats) / test_frames)  # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail obscurely.
        raise ValueError('no frames found in %s' % filename)

    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = feats[start:start + test_frames]
            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)
    return activation
def l2_norm(input, alpha):
    """Normalise each row of a 2-D tensor to unit L2 length and scale by ``alpha``.

    Args:
        input: tensor normalised along dim 1.
        alpha: multiplier applied to the normalised tensor. The paper at
            https://arxiv.org/pdf/1703.09507.pdf suggests 10; the caller in
            this file passes 1.

    Returns:
        A tensor of the same size as ``input``.
    """
    original_size = input.size()
    # The small constant keeps the square root away from zero.
    energy = (input * input).sum(1) + 1e-10
    lengths = energy.sqrt().view(-1, 1).expand_as(input)
    unit_rows = torch.div(input, lengths).view(original_size)
    return unit_rows * alpha
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to the test utterance.

    Args:
        use_cuda, model, test_frames: forwarded to ``get_embeddings``.
        embeddings: dict mapping speaker id to its enrolled embedding.
        test_filename: feature file of the utterance to identify. Its parent
            directory name (up to the first ``_``) is printed as the true
            speaker.
        spk_list: candidate speaker ids. Candidates without an entry in
            ``embeddings`` are skipped with a message.

    Returns:
        The candidate with the highest cosine similarity, or ``None`` when
        no candidate could be scored.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        if spk not in embeddings:
            # A listed speaker without a saved embedding cannot be scored.
            print("No enrolled embedding for %s; skipping" % spk)
            continue
        # .item() gives a plain float instead of a 1-element array.
        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
        if score > max_score:
            max_score = score
            best_spk = spk
    # Parent directory name; unlike indexing split('/'), this does not fail
    # for a path with no directory part.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
def main():
    """Run one identification trial for a fixed test speaker."""
    # Locations
    log_dir = 'new_model4'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 25  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further)
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777', 'sunghwan1',
    ]

    # Set the test speaker
    test_speaker = '207F2088'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)


if __name__ == '__main__':
    main()
No preview for this file type
No preview for this file type
User-agent: *
Disallow: /
No preview for this file type
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: move the model to the GPU when true.
        log_dir: directory containing ``checkpoint_<cp_num>.pth``.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to ``background_resnet`` as ``embedding_size``.
        n_classes: forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with the checkpoint's ``state_dict`` loaded, in eval mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location, a checkpoint holding GPU tensors cannot be
    # restored on a CPU-only run; remap to CPU when CUDA is not requested.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    # The state dict is loaded as stored: no `module.` prefix is stripped
    # here, so its keys must already match the model.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enroll and test rows.

    Args:
        dataroot_dir: directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: match the names literally; as a regex the '.' would
    # match any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    # Reset the index
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every ``<speaker>.pth`` file in ``embedding_dir``.

    Args:
        embedding_dir: directory holding the saved embeddings.
        map_location: forwarded to ``torch.load``; pass ``'cpu'`` to load
            GPU-saved embeddings on a machine without CUDA. The default
            keeps ``torch.load``'s own behaviour.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        loaded object.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        if ext != '.pth':
            # Stray files (editor backups, OS metadata, ...) are not
            # embeddings and would make torch.load fail.
            continue
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the features stored in ``filename``.

    The features are cut into consecutive chunks of ``test_frames`` frames.
    Each chunk goes through the model, the first element of each model
    output is summed over dim 0, and the total is normalised with ``l2_norm``.

    Args:
        use_cuda: move each chunk to the GPU before the forward pass.
        filename: feature file read with ``read_MFB``.
        model: network returning a 2-tuple; only the first element is used.
        test_frames: number of frames per chunk.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    feats, _ = read_MFB(filename)  # input size:(n_frames, n_dims)
    tot_segments = math.ceil(len(feats) / test_frames)  # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail obscurely.
        raise ValueError('no frames found in %s' % filename)

    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            start = i * test_frames
            temp_input = feats[start:start + test_frames]
            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)
            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)
    return activation
def l2_norm(input, alpha):
    """Return ``input`` with every row divided by its L2 norm, times ``alpha``.

    Args:
        input: tensor normalised along dim 1.
        alpha: scale applied after normalisation. The paper at
            https://arxiv.org/pdf/1703.09507.pdf suggests 10; the caller in
            this file passes 1.

    Returns:
        A tensor of the same size as ``input``.
    """
    # Per-row sum of squares, offset by 1e-10 so the norm is never zero.
    sum_of_squares = torch.sum(input ** 2, 1) + 1e-10
    per_row_norm = torch.sqrt(sum_of_squares).view(-1, 1)
    scaled = input / per_row_norm.expand_as(input)
    return scaled.view(input.size()) * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score a test utterance against one enrolled speaker.

    Args:
        use_cuda, model, test_frames: forwarded to ``get_embeddings``.
        embeddings: dict mapping speaker id to its enrolled embedding.
        enroll_speaker: key of the enrolled speaker to compare against.
        test_filename: feature file of the test utterance. Its parent
            directory name (up to the first ``_``) is printed alongside the
            result.
        thres: scores above this value are printed as ``'Accept'``,
            otherwise ``'Reject'``.

    Returns:
        ``(enroll_speaker, score)`` with the cosine similarity formatted as
        a string with four decimals.

    Raises:
        KeyError: if ``enroll_speaker`` has no entry in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    # .item() gives a plain float; formatting a 1-element numpy array with
    # %f relies on a deprecated implicit array-to-scalar conversion.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
    result = 'Accept' if score > thres else 'Reject'
    # Parent directory name; unlike indexing split('/'), this does not fail
    # for a path with no directory part.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
    return (enroll_speaker, '%0.4f' % score)
def main():
    """Run one verification trial for a fixed pair of speakers."""
    # Locations
    log_dir = 'new_model4_merge'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4_merge'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 50  # Which checkpoint to use?
    n_classes = 348  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further)
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list:
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    enroll_speaker = '213F5100'  # Set the true speaker
    test_speaker = '207F2088'  # Set the claimed speaker
    thres = 0.95  # Threshold

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)


if __name__ == '__main__':
    main()