2014104094

Source code upload

"""
기존의 Self Attention을 경량화한 CONVOLUTION BLOCK ATTENTION MODULE
@FUNCTION se_block : Squeeze and Excitation Block
@FUNCTION cbam_block : Convolution Block Attetntion Module
@FUNCTION channel_attention : Channel Attention
@FUNCITON Spatial_attention : Spation_attention
"""
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import GlobalAvgPool2D, GlobalMaxPool2D
from tensorflow.keras.layers import Reshape, Dense, Lambda
from tensorflow.keras.layers import Add, Activation
from tensorflow.keras import backend as K
from tensorflow.keras import layers
"""
Squeeze-and-Excitation(SE) Block
@brief : 채널간의 관계를 재종정 시켜줌
@param input_feature : tensor
"""
def se_block(input_feature, ratio=8):
    channel = input_feature.shape[-1]
    se_feature = GlobalAvgPool2D()(input_feature)
    se_feature = Reshape((1, 1, channel))(se_feature)
    # squeeze: bottleneck MLP reduces the channel dimension by `ratio`
    se_feature = Dense(channel // ratio,
                       activation='relu',
                       kernel_initializer='he_normal',
                       use_bias=True,
                       bias_initializer='zeros')(se_feature)
    # excite: restore the channel dimension and gate it with a sigmoid
    se_feature = Dense(channel,
                       activation='sigmoid',
                       kernel_initializer='he_normal',
                       use_bias=True,
                       bias_initializer='zeros')(se_feature)
    se_feature = layers.multiply([input_feature, se_feature])
    return se_feature
"""
CBAM_BLOCK
@brief : Convolution Block Attention Module
@param cbam_feature : input tensor
@param ratio(int) : channel reduce ratio
@return cbam_feature : dynamic feature selection
"""
def cbam_block(cbam_feature, ratio=8):
    cbam_feature = channel_attention(cbam_feature, ratio)
    cbam_feature = spatial_attention(cbam_feature)
    return cbam_feature
"""
Channel Attention
@brief : Channel Attention, average pool과 max pool을 사용(파라미터 양을 줄일 수 있음)
두 가지 pooled feature는 같은 의미를 공유하는 값이기 때문에 하나의 공유된 MLP를 사용
@param input_feature = input_tensor
@return cbam_feature
"""
def channel_attention(input_feature, ratio=8):
    # channel attention is applied first
    channel = input_feature.shape[-1]
    shared_layer_one = Dense(channel // ratio,
                             activation='relu',
                             kernel_initializer='he_normal',
                             use_bias=True,
                             bias_initializer='zeros')
    shared_layer_two = Dense(channel,
                             kernel_initializer='he_normal',
                             use_bias=True,
                             bias_initializer='zeros')
    # combine average pooling and max pooling
    avg_pool = GlobalAvgPool2D()(input_feature)
    avg_pool = Reshape((1, 1, channel))(avg_pool)
    avg_pool = shared_layer_one(avg_pool)
    avg_pool = shared_layer_two(avg_pool)
    max_pool = GlobalMaxPool2D()(input_feature)
    max_pool = Reshape((1, 1, channel))(max_pool)
    max_pool = shared_layer_one(max_pool)
    max_pool = shared_layer_two(max_pool)
    cbam_feature = Add()([avg_pool, max_pool])
    # sigmoid instead of the mutually exclusive softmax: the goal is not
    # to single out one most important feature
    cbam_feature = Activation('sigmoid')(cbam_feature)
    cbam_feature = layers.multiply([input_feature, cbam_feature])
    return cbam_feature
"""
Spatial Attention
@brief : 2차원의 spatial attention, single convolution을 사용하여 특징이 보이는
channel을 만듬, 정보가 어디에 있는지 중점을 둠
@param ipnut_feature : input_tensor(Channel-refined feature)
"""
def spatial_attention(input_feature, kernel_size=7):
    cbam_feature = input_feature
    # pool along the channel axis to get two H x W maps
    avg_pool = Lambda(lambda x: K.mean(x, axis=3, keepdims=True))(cbam_feature)
    max_pool = Lambda(lambda x: K.max(x, axis=3, keepdims=True))(cbam_feature)
    concat = layers.concatenate([avg_pool, max_pool])
    # a single convolution turns the concatenated maps into the attention map
    cbam_feature = Conv2D(filters=1,
                          kernel_size=kernel_size,
                          strides=1,
                          padding='same',
                          activation='sigmoid',
                          kernel_initializer='he_normal',
                          use_bias=False)(concat)
    return layers.multiply([input_feature, cbam_feature])
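# A minimal smoke test for the attention blocks; a sketch assuming an
# arbitrary 32x32x64 feature map (not part of the original project).
if __name__ == "__main__":
    from tensorflow.keras import Input
    dummy = Input((32, 32, 64))
    print(se_block(dummy).shape)     # attention preserves the shape: (None, 32, 32, 64)
    print(cbam_block(dummy).shape)   # (None, 32, 32, 64)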
"""
ATTENTION을 적용한 MOBILE NET
@FUNCTION load_data : pickle 데이터를 로딩하는 함수
@FUNCTION Mobile_net : 모바일 넷 모델 함수
@FUNCITON predict : 모델 예측하는 함수
"""
import pickle
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, DepthwiseConv2D
from tensorflow.keras.layers import GlobalAvgPool2D, BatchNormalization, ReLU
from attention_module import cbam_block
"""
데이터 로드
@brief : load Inputs and Targets from pickle data
@param data_path(str) : path to pickle file containing data
@return X(ndarray) : Inputs
@return y(ndarray) : Targets
"""
def load_data():
    X = pickle.load(open("X_9.pickle", "rb"))
    y = np.array(pickle.load(open("y_9.pickle", "rb")))
    # normalize pixels to [0, 1]; 255 is the 8-bit maximum
    # (the original code divided by 225, which looks like a typo)
    X = X / 255.0
    return X, y
"""
mobile net 구현
@brief : Mobile net with Convolution Block Attention Module(CBAM)
I used cbam at last of convolution when I used it at every conv block,
the result was worse.
@return model : Mobile Net Model
"""
def mobile_net(input_shape):
    # depthwise-separable convolution block: depthwise conv -> pointwise conv
    def mobile_net_block(x, f, s=1):
        x = DepthwiseConv2D(3, strides=s, padding='same')(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(f, 1, strides=1, padding='same')(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
        return x

    input = Input(input_shape)
    x = Conv2D(32, 3, strides=2, padding='same')(input)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = mobile_net_block(x, 64)
    x = mobile_net_block(x, 128, 2)
    x = mobile_net_block(x, 128)
    x = mobile_net_block(x, 256, 2)
    x = mobile_net_block(x, 256)
    x = mobile_net_block(x, 512, 2)
    for _ in range(5):
        x = mobile_net_block(x, 512)
    x = mobile_net_block(x, 1024, 2)
    x = mobile_net_block(x, 1024)
    # CBAM only after the last convolution block
    x = cbam_block(x)
    x = GlobalAvgPool2D()(x)
    output = Dense(4, activation='softmax')(x)
    model = Model(input, output)
    return model
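# Why the depthwise-separable block is cheap, a worked example for one block
# with a 3x3 kernel and 256 input/output channels (bias terms ignored):
#   standard 3x3 conv     : 3*3*256*256 = 589,824 weights
#   depthwise + pointwise : 3*3*256 + 1*1*256*256 = 67,840 weights
# roughly an 8.7x reduction per block, which is the point of MobileNet.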
"""
학습된 모델로 예측
@brief : predict data from trained mobile net model
@param model : Trained classifier
@param X : Input data
@param y(int): Target
"""
def predict(model, X, y):
    # add a batch dimension to the sample
    X = X[np.newaxis, ...]
    prediction = model.predict(X)
    # argmax gives the index of the highest-scoring class
    predicted_index = np.argmax(prediction, axis=1)
    print("Target: {}, Predicted label: {}".format(y, predicted_index))
# main
if __name__ == "__main__":
    # load data (the validation split happens in model.fit below)
    X_train, y_train = load_data()
    # create the network
    K.clear_session()
    input_shape = (X_train.shape[1], X_train.shape[2], 3)
    model = mobile_net(input_shape)
    # compile the model
    optimiser = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    # train the model
    hist = model.fit(X_train, y_train, validation_split=0.1, batch_size=32, epochs=30)
    # plot loss and accuracy on a shared x-axis
    fig, loss_ax = plt.subplots()
    acc_ax = loss_ax.twinx()
    loss_ax.plot(hist.history['loss'], 'y', label='train loss')
    loss_ax.plot(hist.history['val_loss'], label='validation loss')
    # with metrics=['accuracy'], tf.keras records 'accuracy'/'val_accuracy'
    acc_ax.plot(hist.history['accuracy'], 'b', label='train_acc')
    acc_ax.plot(hist.history['val_accuracy'], 'g', label='validation_acc')
    loss_ax.set_xlabel('epoch')
    loss_ax.set_ylabel('loss')
    acc_ax.set_ylabel('accuracy')
    loss_ax.legend(loc='upper left')
    acc_ax.legend(loc='lower left')
    plt.show()
    # evaluate the model (on the training data, so this is not a held-out test score)
    test_loss, test_acc = model.evaluate(X_train, y_train, verbose=2)
    print('\nTest accuracy:', test_acc)
    X_to_predict = X_train[100]
    y_to_predict = y_train[100]
    # predict a sample
    predict(model, X_to_predict, y_to_predict)
    model.save('C:/Users/nokh9/Desktop/mobile_net2.h5')
"""
부족한 데이터를 증강
frequncy를 나타내는 mel-spectrogram에서 데이터 증강(뒤틀림, 뒤집기)을 사용하면
학습이 더 안될 것 같아서 실제로 사용은 안 함
@FUNCTION data_augumentation : 데이터를 증강하는 함수
@FUNCTION save_into_folder : 증강된 데이터들을 이름에 맞게 폴더별로 정리하는 함
"""
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
"""
Data Augmentation
@brief : 모델이 적은 이미지에서 최대한 많은 정보를 뽑아내서 학습할 수 있도록
데이터를 증강시킴
@param dataset_path(str) : dog_sounds converted into mel_spectrogram
"""
def data_augumentation(dataset_path):
    datagen = ImageDataGenerator(
        rotation_range=40,        # image rotation range (degrees)
        width_shift_range=0.2,    # random horizontal/vertical shifts
        height_shift_range=0.2,
        rescale=1./255,           # scale to the 0-1 range (255, not 225, is the 8-bit maximum)
        shear_range=0.2,          # random shearing transformation range
        zoom_range=0.2,           # random zoom range
        horizontal_flip=False,    # the images represent sound, so no flipping
        fill_mode='nearest')      # fills the space created by rotation/shift/zoom
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        if dirpath != dataset_path:
            for f in filenames:
                file_path = os.path.join(dirpath, f)
                img = load_img(file_path)         # PIL image
                i = 0
                x = img_to_array(img)             # NumPy array of shape (h, w, 3)
                x = x.reshape((1,) + x.shape)     # NumPy array of shape (1, h, w, 3)
                # generate randomly transformed images in batches,
                # stopping after 11 images per source file
                for batch in datagen.flow(x, batch_size=1,
                                          save_to_dir="C:/Users/nokh9/Desktop/dog_sound_train",
                                          save_prefix=f, save_format='jpg'):
                    print("{}, data_augument:{}".format(file_path, i))
                    i += 1
                    if i > 10:
                        break
"""
split categories
@brief 증강된 이미지를 카테고리 별로 나누어서 저장
@param dataset_path(str) : dog_sound argumented
"""
def save_into_folder(dataset_path):
    num = 1
    for filename in os.listdir(dataset_path):
        # the category name is the part of the file name before the first '_'
        find_category = filename.split('_')[0]
        des_path = os.path.join(dataset_path, find_category)
        if not os.path.exists(des_path):
            os.makedirs(des_path)
            num = 1
        os.rename(dataset_path + filename, des_path + '/' + find_category + str(num) + '.jpg')
        num += 1
# main
if __name__ == "__main__":
    data_augumentation("C:/Users/nokh9/Desktop/dog_sound_mel")
    save_into_folder("C:/Users/nokh9/Desktop/dog_sound_train/")
"""
Mel-Sepctrogram 이미지를 학습에 쓰이기 전에 전처리
@FUNCTION create_training_data : 이미지를 전처리하는 함수
"""
import os
import random
import cv2
import numpy as np
import pickle

dataset_path = "C:/Users/nokh9/Desktop/dog_sound_mel/"  # path to the data
CATEGORIES = ["angry", "happy", "lonely", "sad"]        # emotion categories
training_data = []
"""
이미지를 전처리
@brief : preprocessing image for training
"""
def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(dataset_path, category)
        class_num = CATEGORIES.index(category)
        print('-' * 50)
        print(category + " is started...")
        print('-' * 50)
        image_list = os.listdir(path)
        listdir_num = len(image_list)
        # preprocess every image in the data folder
        for number in range(listdir_num):
            try:
                image_path = path + '/' + category + '_' + str(number + 1) + '.jpg'
                image_array = cv2.imread(image_path, cv2.IMREAD_COLOR)
                # cv2.resize takes (width, height), so this yields a 78x62x3 array
                new_array = cv2.resize(image_array, (62, 78))
                training_data.append([new_array, class_num])
                if (number + 1) % 100 == 0:
                    print(str(number + 1) + ' is finished')
            except Exception as e:
                pass
# main
if __name__ == "__main__":
    create_training_data()
    print(len(training_data))
    # shuffle so the later validation split does not follow category order
    random.shuffle(training_data)
    X = []
    y = []
    for features, label in training_data:
        X.append(features)
        y.append(label)
    # each image is 78x62 because cv2.resize uses (width, height) order
    X = np.array(X).reshape(-1, 78, 62, 3)
    # save X and y separately as pickle data
    pickle_out = open("X_9.pickle", "wb")
    pickle.dump(X, pickle_out)
    pickle_out.close()
    pickle_out = open("y_9.pickle", "wb")
    pickle.dump(y, pickle_out)
    pickle_out.close()
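# The training script splits off validation data with Keras's
# validation_split; a hypothetical helper sketching an explicit,
# reproducible alternative with scikit-learn (the function name,
# test_size, and random_state are assumptions, not original code):
def split_dataset():
    from sklearn.model_selection import train_test_split
    X = pickle.load(open("X_9.pickle", "rb"))
    y = np.array(pickle.load(open("y_9.pickle", "rb")))
    # stratify keeps the four emotion classes balanced across the splits
    return train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)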
"""
오디오 파일(.wav)을 다루는 예시 코드
librosa 라이브러리가 알아서 특징을 잡아주고 plot해주기 때문에
소리를 데이터화 할 때 편함
"""
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt
FIG_SIZE = (15,10)
file = "C:/Users/nokh9/Desktop/dog_sound/barking/barking_1.wav"
# load the audio file (librosa resamples to the given sample rate)
signal, sample_rate = librosa.load(file, sr=22500)
# WAVEFORM
# display waveform
plt.figure(figsize=FIG_SIZE)
librosa.display.waveplot(signal, sample_rate, alpha=0.4)
plt.xlabel("Time (s)")
plt.ylabel("Amplitude")
plt.title("Waveform")
# FFT -> power spectrum
# Fourier transform
fft = np.fft.fft(signal)
# take the absolute value of the complex spectrum to get the magnitude
spectrum = np.abs(fft)
# create frequency variable
f = np.linspace(0, sample_rate, len(spectrum))
# take half of the spectrum and frequency
left_spectrum = spectrum[:int(len(spectrum)/2)]
left_f = f[:int(len(spectrum)/2)]
# plot spectrum
plt.figure(figsize=FIG_SIZE)
plt.plot(left_f, left_spectrum, alpha=0.4)
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.title("Power spectrum")
# STFT -> spectrogram
hop_length = 512 # in num. of samples
n_fft = 2048 # window in num. of samples
hop_length_duration = float(hop_length)/sample_rate
n_fft_duration = float(n_fft)/sample_rate
print("STFT hop length duration is: {}s".format(hop_length_duration))
print("STFT window duration is: {}s".format(n_fft_duration))
# perform stft
stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
spectrogram = np.abs(stft)
# display the spectrogram
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")        # x-axis: time
plt.ylabel("Frequency")   # y-axis: frequency
plt.colorbar()            # color scale makes the magnitudes easy to compare
plt.title("Spectrogram")
# apply logarithm to cast amplitude to Decibels
log_spectrogram = librosa.amplitude_to_db(spectrogram)
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(log_spectrogram, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")
# MFCCs
# extract 13 MFCCs
MFCCs = librosa.feature.mfcc(signal, sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)
# display MFCCs
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(MFCCs, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("MFCC coefficients")
plt.colorbar()
plt.title("MFCCs")
# show plots
plt.show()
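# The training images in dog_sound_mel are mel-spectrogram .jpg files; the
# conversion script is not included above, but the sketch below shows one
# way such an image could be saved (n_mels and the output path are
# assumptions, not the original settings).
mel = librosa.feature.melspectrogram(y=signal, sr=sample_rate,
                                     n_fft=n_fft, hop_length=hop_length,
                                     n_mels=128)
mel_db = librosa.power_to_db(mel, ref=np.max)   # power -> decibels
plt.figure(figsize=(5, 4))
librosa.display.specshow(mel_db, sr=sample_rate, hop_length=hop_length)
plt.axis('off')   # keep only the spectrogram pixels, no axes or colorbar
plt.savefig("C:/Users/nokh9/Desktop/dog_sound_mel/example.jpg",
            bbox_inches='tight', pad_inches=0)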
"""
Real-time inference: microphone audio -> mel-spectrogram image -> trained model
"""
import cv2
import numpy as np
import pyaudio
import librosa
import librosa.display
import matplotlib.pyplot as plt
import time
import tensorflow.keras as keras

rate = 22500
chunk_size = rate // 4
CATEGORIES = ["angry", "happy", "lonely", "sad"]
model = keras.models.load_model("C:/Users/nokh9/Desktop/mobile_net2.h5")
def prepare(mel):
    # read the saved mel-spectrogram image and shape it for the model;
    # cv2.resize takes (width, height), giving a 78x62x3 array
    img_array = cv2.imread(mel)
    new_array = cv2.resize(img_array, (62, 78))
    return new_array.reshape(-1, 78, 62, 3)
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paFloat32,
                channels=1,
                rate=rate,
                input=True,
                input_device_index=1,
                frames_per_buffer=chunk_size)
frames = []
do_melspec = librosa.feature.melspectrogram
pwr_to_db = librosa.core.power_to_db
"""
while True:
start = time.time()
data = stream.read(chunk_size)
data = np.fromstring(data, dtype=np.float32)
melspec = do_melspec(y=data, sr=rate, n_mels=128, fmax=4000)
norm_melspec = pwr_to_db(melspec, ref=np.max)
frames.append(norm_melspec)
if len(frames) == 20:
stack = np.hstack(frames)
plt.figure(figsize=(5, 4))
librosa.display.specshow(stack,fmax=4000)
plt.savefig('C:/Users/nokh9/Desktop/DB/' + 'db.jpg')
prediction = model.predict([prepare(r'C:/Users/nokh9/Desktop/DB/db.jpg')])
print(CATEGORIES[int(prediction[0][0])])
plt.draw()
plt.pause(0.0001)
plt.clf()
#break
frames.pop(0)
t = time.time() - start
print(1 / t)
"""
# one-shot test on a saved mel-spectrogram image
prediction = model.predict(prepare(r'C:/Users/nokh9/Desktop/DB/lonely_1.jpg'))
print(prediction)