Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2021-1-capstone-design2
/
2015104194
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
GyuhoLee
2021-05-25 19:15:13 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
d102e871efef7ea89a09b58b0bfed62e7745a9a3
d102e871
1 parent
420d0816
[Add] data collection code
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
171 additions
and
0 deletions
src/collect_by_melon.py
src/collect_by_melon.py
0 → 100644
View file @
d102e87
import
re
,
pickle
from
selenium
import
webdriver
from
song
import
*
# Timeout handed to Selenium's implicitly_wait() for element lookups.
WAIT_TIME = 5

# Chart year to collect; compared against the text of each year entry in the
# Melon chart finder, so it must use the site's own label format.
YEAR = '2021년'

# Months collected in one run; compared against the text of each month entry.
# Alternative batches (swap one in to collect a different part of the year):
#   ['01월', '02월']
#   ['03월', '04월']
#   ['07월', '08월', '09월']
#   ['10월', '11월', '12월']
MONTH = ['05월', '06월']
def _set_class(driver, element, class_name):
    """Overwrite *element*'s ``class`` attribute through JavaScript."""
    driver.execute_script(
        "arguments[0].setAttribute('class', arguments[1])", element, class_name)


def _scrape_song(driver, song_id, year, month, rank):
    """Read one song's detail page in a temporary tab and return a ``Song``.

    Args:
        driver: The Selenium driver whose first tab shows the chart page.
        song_id: Digits extracted from the chart row link, used as ``songId``.
        year: Chart year as an int.
        month: Chart month as an int.
        rank: Position of the song on the chart.

    Raises:
        IndexError: If a required element is missing from the detail page.
        ValueError: If the like counter is not a number.
    """
    song = Song()
    song.year = year
    song.month = month
    song.rank = rank

    # The detail page goes into a second tab so the chart stays loaded in the
    # first one.
    driver.execute_script(
        "window.open('https://www.melon.com/song/detail.htm?songId="
        + song_id + "');")
    driver.switch_to.window(driver.window_handles[1])

    def first(selector):
        return driver.find_elements_by_css_selector(selector)[0]

    entry = '#downloadfrm > div > div > div.entry'
    song.title = first(entry + ' > div.info > div.song_name').text
    song.singer = first(
        entry + ' > div.info > div.artist > a > span:nth-child(1)').text
    song.album = first(entry + ' > div.meta > dl > dd:nth-child(2) > a').text
    song.genre = first(entry + ' > div.meta > dl > dd:nth-child(6)').text
    song.date = first(entry + ' > div.meta > dl > dd:nth-child(4)').text
    song.likes = int(first('#d_like_count').text.replace(',', ''))

    # Lyrics are optional: the attribute is only set when the block exists.
    lyrics = driver.find_elements_by_css_selector('#d_video_summary')
    if lyrics:
        song.lyrics = lyrics[0].text

    # Drop the query string from the cover URL. split() leaves a URL without
    # '?' untouched, whereas slicing at find('?') == -1 would cut off its
    # last character.
    image_url = first(
        '#downloadfrm > div > div > div.thumb > a > img').get_attribute('src')
    song.setImage(image_url.split('?', 1)[0])

    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    return song


def _scrape_chart_rows(driver, link_selector, year, month, first_rank):
    """Scrape every song linked by *link_selector*, ranked from *first_rank*."""
    # Read all hrefs before opening any detail tab; the chart page does not
    # need to be queried again for each song.
    hrefs = [link.get_attribute('href')
             for link in driver.find_elements_by_css_selector(link_selector)]

    scraped = []
    for rank, href in enumerate(hrefs, start=first_rank):
        song_id = re.findall(r'\d+', href)[0]
        song = _scrape_song(driver, song_id, year, month, rank)
        scraped.append(song)
        print(song.rank, song.title)
    return scraped


def GetMelonData():
    """Collect the Melon monthly chart for ``YEAR`` and the months in ``MONTH``.

    Drives Chrome through ``chromedriver.exe``: opens the chart finder,
    selects the monthly chart, walks the era / year / month lists until the
    entry whose text equals ``YEAR`` is found, and for each month listed in
    ``MONTH`` runs the search and scrapes the ``#lst50`` rows (ranks from 1)
    and the ``#lst100`` rows (ranks from 51). Each song's detail page is
    visited to fill in title, singer, album, genre, release date, likes,
    lyrics and cover image.

    Progress (year labels, month labels, ``rank title`` per song) is printed
    to stdout. Scanning stops as soon as the target year has been processed.

    Returns:
        list: ``Song`` objects in scrape order; empty if no year entry
        matched ``YEAR``.

    Raises:
        IndexError, ValueError: Propagated from the detail-page scrape when
            an expected element is missing or malformed.
    """
    # NOTE(review): find_element_by_* and the positional driver path are only
    # available in older Selenium releases (removed from 4.3 on) -- confirm
    # the version this project pins before upgrading.
    songs = []
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        # The implicit wait is a session-wide setting, so it is configured
        # once instead of after every step.
        driver.implicitly_wait(WAIT_TIME)
        driver.get('https://www.melon.com/chart/index.htm')

        # Open the chart finder.
        driver.find_element_by_xpath(
            "//*[@id='gnb_menu']/ul[1]/li[1]/div/div/button").click()

        # Switch the "chart selection" header to class 'on'.
        _set_class(
            driver,
            driver.find_element_by_xpath("//*[@id='d_chart_search']/h3"),
            'on')

        # Switch the monthly-chart tab to 'on' and click it.
        monthlyChart = driver.find_element_by_class_name("tab02")
        _set_class(driver, monthlyChart, 'tab02 on')
        monthlyChart.click()

        panel = "//*[@id='d_chart_search']/div/div"

        _set_class(driver, driver.find_element_by_xpath(panel + "/div[1]"),
                   'box_chic nth1 view on')
        eraList = driver.find_elements_by_xpath(panel + "/div[1]/div[1]/ul/li")
        eraIgnore = ['1990년대', '1980년대']

        # NOTE(review): this flag only flips after 2004/11 has been scraped
        # in this same run, so a run restricted to an earlier YEAR still asks
        # for gnr_2 -- confirm whether that is intended.
        isGnr2 = True

        # Activate and select each era that is not ignored.
        for era in eraList:
            if era.text in eraIgnore:
                continue
            _set_class(driver, era, 'on')
            era.click()

            # Activate the year box and look for the target year.
            _set_class(driver, driver.find_element_by_xpath(panel + "/div[2]"),
                       'box_chic nth2 view on')
            yearList = driver.find_elements_by_xpath(
                panel + "/div[2]/div[1]/ul/li")
            for year in yearList:
                year_label = year.text
                print(year_label)
                if year_label != YEAR:
                    continue
                _set_class(driver, year, 'on')
                year.click()
                year_number = int(year_label[:4])

                # Activate the month box and visit each requested month.
                _set_class(
                    driver,
                    driver.find_element_by_xpath(panel + "/div[3]"),
                    'box_chic nth3 view on')
                monthList = driver.find_elements_by_xpath(
                    panel + "/div[3]/div[1]/ul/li")
                for month in monthList:
                    month_label = month.text
                    print(month_label)
                    if month_label not in MONTH:
                        continue
                    _set_class(driver, month, 'on')
                    month.click()

                    # Activate the genre box and pick the genre.
                    #   1980s - 2004/10  : single domestic-overall chart (gnr_1)
                    #   2004/11 - 2016/12: select the 가요 genre (gnr_2)
                    #   2017 onwards     : select domestic overall (gnr_2)
                    _set_class(
                        driver,
                        driver.find_element_by_xpath(panel + "/div[5]"),
                        'box_chic last view on')
                    if isGnr2:
                        genreSelect = driver.find_element_by_id("gnr_2")
                        if year_label == '2004년' and month_label == '11월':
                            isGnr2 = False
                    else:
                        genreSelect = driver.find_element_by_id("gnr_1")
                    _set_class(driver, genreSelect, 'on')
                    genreSelect.click()

                    # Run the search.
                    driver.find_element_by_xpath(
                        "//*[@id='d_srch_form']/div[2]/button").click()

                    month_number = int(month_label[:-1])
                    # Ranks 1-50, then 51-100.
                    songs.extend(_scrape_chart_rows(
                        driver, '#lst50 > td:nth-child(4) > div > a',
                        year_number, month_number, 1))
                    songs.extend(_scrape_chart_rows(
                        driver, '#lst100 > td:nth-child(4) > div > a',
                        year_number, month_number, 51))

                # The target year has been handled; nothing else to scan.
                return songs
    finally:
        # Always shut the browser and chromedriver down, even when a lookup
        # fails halfway through.
        driver.quit()
    return songs
# Scrape the configured year/months, then append the result to the songs
# already stored in data.pickle.
data = GetMelonData()

# NOTE: only load a data.pickle that this script wrote itself -- unpickling
# a file from an untrusted source can execute arbitrary code.
try:
    with open('data.pickle', 'rb') as f:
        before = pickle.load(f)
except FileNotFoundError:
    # First run: there is no history yet, so start from an empty list rather
    # than losing the freshly scraped songs to an exception.
    before = []

# `before + data` assumes the stored object is a list, as written below.
with open('data.pickle', 'wb') as f:
    pickle.dump(before + data, f)
Please
register
or
login
to post a comment