Showing
1 changed file
with
171 additions
and
0 deletions
src/collect_by_melon.py
0 → 100644
1 | +import re, pickle | ||
2 | +from selenium import webdriver | ||
3 | +from song import * | ||
4 | + | ||
# Timeout, in seconds, passed to Selenium's implicit wait.
WAIT_TIME = 5
# Year to collect, spelled exactly as the chart finder labels it.
YEAR = '2021년'
# Months to collect in this run, spelled as the chart finder labels them.
# The commented-out lines are alternative batches left by the author; enable
# exactly one MONTH assignment per run.
#MONTH = ['01월', '02월']
#MONTH = ['03월', '04월']
MONTH = ['05월', '06월']
#MONTH = ['07월', '08월', '09월']
#MONTH = ['10월', '11월', '12월']
12 | + | ||
def GetMelonData():
    """Collect Melon monthly-chart songs for ``YEAR`` and the months in ``MONTH``.

    Starts Chrome through ``chromedriver.exe``, opens the Melon chart page,
    drives the chart finder (monthly tab -> era -> year -> month -> genre ->
    search) and, for every chart entry, opens the song detail page in a second
    tab to read its metadata.

    Returns:
        list: one ``Song`` per scraped chart entry, in scrape order. Empty if
        no era/year/month label matched ``YEAR``/``MONTH``.

    Raises:
        IndexError: if an expected element is missing on a song detail page
            or a chart link contains no digits.
        selenium.common.exceptions.WebDriverException: on browser/driver
            failures (element not found, stale element, ...).

    The browser is always shut down, even when scraping fails part-way.
    """
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        # implicitly_wait() does not pause: it sets the element-lookup timeout
        # for the whole session, so it only needs to be configured once.
        driver.implicitly_wait(WAIT_TIME)
        driver.get('https://www.melon.com/chart/index.htm')
        _open_monthly_finder(driver)
        return _collect_requested_charts(driver)
    finally:
        driver.quit()


# Era labels that are never searched.
_IGNORED_ERAS = ('1990년대', '1980년대')
# Row id of each half of a chart page and the rank of its first row.
_CHART_HALVES = (('#lst50', 1), ('#lst100', 51))
# Common CSS prefix of the metadata fields on a song detail page.
_DETAIL = '#downloadfrm > div > div > div.entry > '


def _set_class(driver, element, css_class):
    """Overwrite *element*'s ``class`` attribute via JavaScript."""
    driver.execute_script(
        "arguments[0].setAttribute('class', arguments[1])", element, css_class)


def _activate(driver, element):
    """Mark a finder option as selected (class ``on``) and click it."""
    _set_class(driver, element, 'on')
    element.click()


def _expand_filter(driver, position, css_class):
    """Force-open the finder box at *position* and return its ``<li>`` options."""
    box_xpath = "//*[@id='d_chart_search']/div/div/div[%d]" % position
    _set_class(driver, driver.find_element_by_xpath(box_xpath), css_class)
    return driver.find_elements_by_xpath(box_xpath + "/div[1]/ul/li")


def _open_monthly_finder(driver):
    """Open the chart finder and switch it to the monthly-chart tab."""
    # Click the chart-finder button in the top menu.
    driver.find_element_by_xpath(
        "//*[@id='gnb_menu']/ul[1]/li[1]/div/div/button").click()
    # Switch the chart-selection header to class 'on'.
    _set_class(driver,
               driver.find_element_by_xpath("//*[@id='d_chart_search']/h3"),
               'on')
    # Switch the monthly-chart tab to class 'on' and click it.
    monthly_tab = driver.find_element_by_class_name("tab02")
    _set_class(driver, monthly_tab, 'tab02 on')
    monthly_tab.click()


def _select_genre(driver, year_number, month_number):
    """Pick the genre option for the chart of the given year and month.

    Per the original author's notes:
      * 1980s - 2004/10: a single "domestic overall" option (``gnr_1``)
      * 2004/11 - 2016/12: choose "gayo" (``gnr_2``)
      * 2017 onwards: choose "domestic overall" (``gnr_2``)

    The option is derived from the chart date itself, so the choice does not
    depend on which months happened to be visited earlier in the run.
    """
    genre_box = driver.find_element_by_xpath(
        "//*[@id='d_chart_search']/div/div/div[5]")
    _set_class(driver, genre_box, 'box_chic last view on')
    genre_id = 'gnr_1' if (year_number, month_number) < (2004, 11) else 'gnr_2'
    _activate(driver, driver.find_element_by_id(genre_id))


def _first_text(driver, selector):
    """Return the text of the first element matching *selector*."""
    return driver.find_elements_by_css_selector(selector)[0].text


def _scrape_song(driver, song_id, year_number, month_number, rank):
    """Open the detail page of *song_id* in a second tab and build its ``Song``."""
    song = Song()
    song.year = year_number
    song.month = month_number
    song.rank = rank

    driver.execute_script(
        "window.open('https://www.melon.com/song/detail.htm?songId="
        + song_id + "');")
    driver.switch_to.window(driver.window_handles[1])
    try:
        song.title = _first_text(driver, _DETAIL + 'div.info > div.song_name')
        song.singer = _first_text(
            driver, _DETAIL + 'div.info > div.artist > a > span:nth-child(1)')
        song.album = _first_text(
            driver, _DETAIL + 'div.meta > dl > dd:nth-child(2) > a')
        song.genre = _first_text(
            driver, _DETAIL + 'div.meta > dl > dd:nth-child(6)')
        song.date = _first_text(
            driver, _DETAIL + 'div.meta > dl > dd:nth-child(4)')
        song.likes = int(_first_text(driver, '#d_like_count').replace(',', ''))
        # Lyrics are optional: the attribute is only set when the page has them.
        lyrics = driver.find_elements_by_css_selector('#d_video_summary')
        if lyrics:
            song.lyrics = lyrics[0].text
        image_url = driver.find_elements_by_css_selector(
            '#downloadfrm > div > div > div.thumb > a > img'
        )[0].get_attribute('src')
        # Drop the query string; a URL without '?' is kept whole.
        song.setImage(image_url.split('?', 1)[0])
    finally:
        # Always return to the chart tab so the caller's elements stay usable.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    return song


def _scrape_chart(driver, year_number, month_number):
    """Scrape ranks 1-100 of the chart currently shown in the finder results."""
    songs = []
    for row_id, first_rank in _CHART_HALVES:
        links = driver.find_elements_by_css_selector(
            row_id + ' > td:nth-child(4) > div > a')
        # Read every link target before opening detail tabs.
        hrefs = [link.get_attribute('href') for link in links]
        for rank, href in enumerate(hrefs, start=first_rank):
            # The first run of digits in the link is used as the song id.
            song_id = re.findall(r'\d+', href)[0]
            song = _scrape_song(driver, song_id, year_number, month_number, rank)
            songs.append(song)
            print(song.rank, song.title)
    return songs


def _collect_requested_charts(driver):
    """Walk era -> year -> month in the finder and scrape each requested chart."""
    songs = []
    # Open the era box and select eras one by one.
    for era in _expand_filter(driver, 1, 'box_chic nth1 view on'):
        if era.text in _IGNORED_ERAS:
            continue
        _activate(driver, era)

        # Open the year box and look for the requested year.
        for year in _expand_filter(driver, 2, 'box_chic nth2 view on'):
            year_label = year.text
            print(year_label)
            if year_label != YEAR:
                continue
            _activate(driver, year)
            year_number = int(year_label[:4])

            # Open the month box and visit every requested month.
            for month in _expand_filter(driver, 3, 'box_chic nth3 view on'):
                month_label = month.text
                print(month_label)
                if month_label not in MONTH:
                    continue
                _activate(driver, month)
                month_number = int(month_label[:-1])

                _select_genre(driver, year_number, month_number)
                # Run the search.
                driver.find_element_by_xpath(
                    "//*[@id='d_srch_form']/div[2]/button").click()
                songs.extend(_scrape_chart(driver, year_number, month_number))

            # NOTE(review): the original ended its loops with a single `break`
            # whose nesting level was lost with the indentation. Returning once
            # the requested year is handled gives the same result when YEAR is
            # in the first searched era, and still finds YEAR in a later era.
            return songs
    return songs
163 | + | ||
# Scrape the configured charts, then merge them into the on-disk collection.
data = GetMelonData()

# data.pickle is this script's own earlier output. pickle.load executes
# arbitrary code from the file, so never point it at an untrusted pickle.
try:
    with open('data.pickle', 'rb') as f:
        before = pickle.load(f)
except FileNotFoundError:
    # First run: nothing collected yet, so start from an empty collection
    # instead of crashing after the scrape has already finished.
    before = []

# Serialise first: if pickling fails, the existing file is not truncated.
_payload = pickle.dumps(before + data)
with open('data.pickle', 'wb') as f:
    f.write(_payload)
171 | + |
-
Please register or login to post a comment