khu

backup

This file is too large to display.
CapsNet @ 7d884474
Subproject commit 7d8844740c119ae66576be9510474a791240a745
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from scipy.fftpack import fft\n",
"from scipy.io import wavfile # get the api"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"fs, data = wavfile.read('1.wav')  # load sample rate and PCM samples\n",
"# data is (n,) for mono or (n, channels) for stereo; the original\n",
"# data.T[0] assumed stereo and raised TypeError on mono data\n",
"# ('numpy.int16' object is not iterable, as the saved traceback shows).\n",
"a = data.T[0] if data.ndim > 1 else data\n",
"# Normalize by the dtype's full scale instead of a hard-coded 8-bit\n",
"# divisor (the file here is 16-bit): unsigned samples are re-centered,\n",
"# signed samples are scaled by 2**(bits-1); both land in [-1, 1).\n",
"if data.dtype.kind == 'u':\n",
"    b = (a / 2.0 ** (8 * data.dtype.itemsize)) * 2 - 1\n",
"else:\n",
"    b = a / 2.0 ** (8 * data.dtype.itemsize - 1)\n",
"c = fft(b)  # complex spectrum\n",
"d = int(len(c) / 2)  # real signal: spectrum is symmetric, keep half\n",
"plt.plot(abs(c[:(d - 1)]), 'r')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: print basic wav file info here\n",
"# (channels: 1 for mono, 2 for stereo)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import wave\n",
"import sys\n",
"import struct\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open the wave file\n",
"fp = wave.open('birdsound.wav',\"rb\")\n",
"sample_rate = fp.getframerate()\n",
"total_num_samps = fp.getnframes()\n",
"# In a notebook sys.argv holds the kernel's connection-file path, so\n",
"# the original int(sys.argv[2]) raised ValueError (see old traceback).\n",
"# Use an explicit constant instead.\n",
"fft_length = 256  # samples per FFT frame\n",
"# floor division so num_fft is an int (it is used as an array shape)\n",
"num_fft = (total_num_samps // fft_length) - 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Allocate a scratch array (num_fft frames x fft_length samples).\n",
"# `zeros((..), Float)` was old Numeric API; both names are undefined\n",
"# in this notebook, so use numpy explicitly.\n",
"import numpy as np\n",
"\n",
"temp = np.zeros((num_fft, fft_length), dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the data from the file, one FFT frame at a time.\n",
"# `array(..., Float)` was old Numeric API and is undefined here.\n",
"import numpy as np\n",
"\n",
"for i in range(num_fft):\n",
"    tempb = fp.readframes(fft_length)\n",
"    # 8-bit unsigned PCM: unpack as raw bytes, re-center around zero\n",
"    temp[i, :] = np.array(struct.unpack(\"%dB\" % (fft_length,),\n",
"                                        tempb), dtype=float) - 128.0\n",
"fp.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Window the data (reduces spectral leakage at frame edges)\n",
"temp = temp * np.hamming(fft_length)\n",
"\n",
"# Transform with the FFT, return power in dB.\n",
"# The original line was missing a closing parenthesis (SyntaxError)\n",
"# and used the old Numeric names log10/real_fft; np.fft.rfft is the\n",
"# modern real-input FFT. 1e-20 guards against log10(0).\n",
"freq_pwr = 10 * np.log10(1e-20 + abs(np.fft.rfft(temp, fft_length)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Plot the result.\n",
"# NOTE(review): setvar/conshade/disfin look like the DISLIN plotting\n",
"# package, which is never imported in this notebook -- confirm the\n",
"# dependency or port this cell to matplotlib.\n",
"n_out_pts = (fft_length // 2) + 1  # rfft output length must be an int\n",
"y_axis = 0.5 * float(sample_rate) / n_out_pts * \\\n",
"         np.arange(n_out_pts)\n",
"x_axis = (total_num_samps / float(sample_rate)) / \\\n",
"         num_fft * np.arange(num_fft)\n",
"setvar(\"X\",\"Time (sec)\")\n",
"setvar(\"Y\",\"Frequency (Hertz)\")\n",
"conshade(freq_pwr,x_axis,y_axis)\n",
"disfin()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff could not be displayed because it is too large.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import wave\n",
"import pyaudio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def play_file(fname):\n",
"    \"\"\"Play the wave file at `fname` through the default audio device.\"\"\"\n",
"    # The original opened a hard-coded 'output.wav' and ignored fname.\n",
"    wf = wave.open(fname, 'rb')\n",
"    p = pyaudio.PyAudio()\n",
"    chunk = 1024  # frames per write\n",
"    \n",
"    # Open an output stream matching the file's own format\n",
"    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),\n",
"                    channels=wf.getnchannels(),\n",
"                    rate=wf.getframerate(),\n",
"                    output=True)\n",
"    \n",
"    # readframes() returns bytes, so the original `data != ''`\n",
"    # comparison was always true and the loop never terminated at EOF.\n",
"    data = wf.readframes(chunk)\n",
"    while data:\n",
"        stream.write(data)  # push this chunk to the audio device\n",
"        data = wf.readframes(chunk)\n",
"    \n",
"    # Release the stream, the file, and the audio system\n",
"    stream.stop_stream()\n",
"    stream.close()\n",
"    wf.close()\n",
"    p.terminate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"play_file('output.wav')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed. Click to expand it.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The imports were uniformly indented in the original; that is only\n",
"# valid because IPython auto-dedents cell input -- as plain Python\n",
"# (e.g. after nbconvert to .py) it raises IndentationError. Keep them\n",
"# at column 0.\n",
"import numpy as np\n",
"import pylab\n",
"import matplotlib.pyplot as plt\n",
"from scipy.io import wavfile\n",
"import time\n",
"import sys\n",
"import seaborn as sns\n",
"import pyaudio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"i = 0\n",
"fig, ax = plt.subplots(2)\n",
"\n",
"# Seed both panels with placeholder data so the line artists exist;\n",
"# later cells update them in place via li / li2.\n",
"x = np.arange(10000)\n",
"y = np.random.randn(10000)\n",
"\n",
"# Top panel: raw audio samples\n",
"li, = ax[0].plot(x, y)\n",
"ax[0].set_xlim(0, 1000)\n",
"ax[0].set_ylim(-5000, 5000)\n",
"ax[0].set_title(\"Raw Audio Signal\")\n",
"\n",
"# Bottom panel: FFT of the audio\n",
"li2, = ax[1].plot(x, y)\n",
"ax[1].set_xlim(0, 5000)\n",
"ax[1].set_ylim(-100, 100)\n",
"ax[1].set_title(\"Fast Fourier Transform\")\n",
"\n",
"# Render once without blocking so the figure can be updated live\n",
"plt.pause(0.01)\n",
"plt.tight_layout()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FORMAT = pyaudio.paInt16  # 16-bit samples\n",
"CHANNELS = 1\n",
"RATE = 44100\n",
"CHUNK = 1024  # frames read from the buffer per call\n",
"RECORD_SECONDS = 0.1\n",
"WAVE_OUTPUT_FILENAME = \"file.wav\"\n",
"\n",
"audio = pyaudio.PyAudio()\n",
"\n",
"# start Recording\n",
"stream = audio.open(format=FORMAT,\n",
"                    channels=CHANNELS,\n",
"                    rate=RATE,\n",
"                    input=True)#,\n",
"                    #frames_per_buffer=CHUNK)\n",
"\n",
"global keep_going\n",
"keep_going = True\n",
"\n",
"def plot_data(in_data):\n",
"    \"\"\"Push one raw audio buffer into the two plots; return keep_going.\"\"\"\n",
"    # np.fromstring is deprecated for binary input; frombuffer is the\n",
"    # supported way to view the raw bytes as int16 samples.\n",
"    audio_data = np.frombuffer(in_data, np.int16)\n",
"    # FFT of the buffer; 10*log10(abs) scales the magnitude to dB\n",
"    # and drops the imaginary part\n",
"    dfft = 10. * np.log10(abs(np.fft.rfft(audio_data)))\n",
"\n",
"    # Update the existing artists instead of redrawing the axes --\n",
"    # plt.draw() would re-render everything on each buffer\n",
"    li.set_xdata(np.arange(len(audio_data)))\n",
"    li.set_ydata(audio_data)\n",
"    li2.set_xdata(np.arange(len(dfft)) * 10.)\n",
"    li2.set_ydata(dfft)\n",
"\n",
"    # Show the updated plot without blocking\n",
"    plt.pause(0.01)\n",
"    return keep_going\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open the connection and start streaming the data\n",
"stream.start_stream()\n",
"print (\"\\n+---------------------------------+\")\n",
"print (\"| Press Ctrl+C to Break Recording |\")\n",
"print (\"+---------------------------------+\\n\")\n",
"\n",
"# Keep pulling buffers until the user interrupts\n",
"while keep_going:\n",
"    try:\n",
"        plot_data(stream.read(CHUNK))\n",
"    except KeyboardInterrupt:\n",
"        keep_going = False\n",
"    except Exception:\n",
"        # best-effort: skip buffers that fail to read or plot. The\n",
"        # original bare `except:` also swallowed SystemExit and\n",
"        # GeneratorExit; catching Exception keeps those working.\n",
"        pass\n",
"\n",
"# Shut the stream and the audio system down cleanly\n",
"stream.stop_stream()\n",
"stream.close()\n",
"\n",
"audio.terminate()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TEST할것\n",
"<입력값>\n",
"1. 일정 데시벨파워 이하는 다 지운 멜스펙토그램 <-아... 원래 데이터셋 새소리 어느정도 작은 소리까지 잡는겨\n",
"2. mel filter 80 & axis=1평균을 뺀 멜스펙토그램\n",
"3. 새소리 주파수 이하는 주파수대는 짜름\n",
"5. 커널씩 보면서 차이나는것만 뽑아내기 (그주변만 비교하는게 있을거같은데)\n",
"6. 멜필터 안씌운거 보고 새소리 주파수 이하 짤라버릴때등.. 안씌운거 보기\n",
"\n",
"<모델>\n",
"1. 일반 뛰어난 CNN모델. 첫번째 모델이 왜 더 좋은지 보자\n",
"3. C+RNN <- RNN 부분 잘 된건가?\n",
"4. Capsul network\n",
"\n",
"<추가적>\n",
"1. 라벨링 : 확실히 귀에 잘 들리고 눈에 잘 보이는 애들만 1로 라벨링, 희미한건 0으로 라벨링 \n",
" -> 필드테스트 한 애들만 맞춘거 비율이 높도록 보이기. 머신이 새라고 한거중에 0인애들만 또 보여주기\n",
"2. 필드테스트랑 원래하던거랑 왜안될까? -> 짹짹이가 데이터셋에 별로없거나 모델이 안좋거나.. (먼저 컴한테 분류시켜보고 판단?)\n",
"3. 그경우, 찌르레기 소리에 초점을 맞춰서 저 패턴을 학습시키고 아예 그걸 찾도록 하는것도 나쁘지 않을듯\n",
"4. 아이폰녹음이랑 뭐가다른지, 실제로 차이가 난건지도 봐야함. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. CRNN - RNN 코드 다시 보고 돌리기\n",
"2. CNN - 논문1네 모델 돌리기\n",
"3. 캡슐 네트워크 돌리기\n",
"\n",
"#### scipy / librosa 둘다로 mel spectrogram 짜봤는데 librosa가 더 좋았음."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This file is too large to display.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pyaudio\n",
"import wave"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recording parameters\n",
"CHUNK = 1024\n",
"FORMAT = pyaudio.paInt16\n",
"CHANNELS = 1\n",
"RATE = 25600\n",
"RECORD_SECONDS = 10\n",
"WAVE_OUTPUT_FILENAME = \"test.wav\"\n",
"\n",
"pa = pyaudio.PyAudio()\n",
"\n",
"# Open the microphone input stream\n",
"stream = pa.open(format=FORMAT,\n",
"                 channels=CHANNELS,\n",
"                 rate=RATE,\n",
"                 input=True,\n",
"                 frames_per_buffer=CHUNK)\n",
"\n",
"print(\"* recording\")\n",
"\n",
"# Capture RECORD_SECONDS worth of CHUNK-sized buffers\n",
"num_chunks = int(RATE / CHUNK * RECORD_SECONDS)\n",
"frames = [stream.read(CHUNK) for _ in range(num_chunks)]\n",
"\n",
"print(\"* done recording\")\n",
"\n",
"stream.stop_stream()\n",
"stream.close()\n",
"\n",
"# Write the captured buffers out as a wave file\n",
"wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n",
"wf.setnchannels(CHANNELS)\n",
"wf.setsampwidth(pa.get_sample_size(FORMAT))\n",
"wf.setframerate(RATE)\n",
"wf.writeframes(b''.join(frames))\n",
"wf.close()\n",
"pa.terminate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pydub"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pydub import AudioSegment\n",
"\n",
"SEGMENT_MS = 10000  # 10-second slices (pydub works in milliseconds)\n",
"NUM_SEGMENTS = 45\n",
"\n",
"# Load the recording once; the original called AudioSegment.from_wav\n",
"# inside the loop, re-reading the entire file from disk on every one\n",
"# of the 45 iterations.\n",
"full_audio = AudioSegment.from_wav(\"./New/IMG_3867.wav\")\n",
"\n",
"for i in range(0, NUM_SEGMENTS):\n",
"    t1 = i * SEGMENT_MS\n",
"    t2 = (i + 1) * SEGMENT_MS\n",
"    segment = full_audio[t1:t2]\n",
"    segment.export('./field/IMG_3867%d.wav' % (i), format=\"wav\")\n",
"\n",
"print('done')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}