GyuhoLee

[Add] data collection code

1 +import re, pickle
2 +from selenium import webdriver
3 +from song import *
4 +
# Implicit-wait timeout (seconds) handed to the WebDriver session.
WAIT_TIME = 5

# Year label to scrape; compared verbatim against the text of the entries in
# the chart finder's year list.
YEAR = '2021년'

# Month labels to scrape in this run; compared verbatim against the text of
# the entries in the chart finder's month list.  A year is collected in
# several batches by editing this list between runs; the other batches used
# so far were ['01월', '02월'], ['03월', '04월'], ['07월', '08월', '09월'] and
# ['10월', '11월', '12월'].
MONTH = ['05월', '06월']
12 +
def _genre_element_id(year, month):
    """Return the id of the genre option to select for the given chart month.

    Derived from the original author's note on the genre selector:
      * up to 2004/10 there is a single "domestic overall" option (gnr_1);
      * from 2004/11 onwards the wanted option is the second one (gnr_2).
    Computing this from the date (instead of a flag that flips only when
    2004/11 is actually visited) keeps it correct when YEAR/MONTH filter the
    run down to an earlier period.
    """
    return 'gnr_2' if (year, month) >= (2004, 11) else 'gnr_1'


def _strip_query(url):
    """Return *url* without its query string; unchanged if it has none."""
    return url.split('?', 1)[0]


def _set_class(driver, element, class_value):
    """Overwrite the ``class`` attribute of *element* via JavaScript."""
    driver.execute_script(
        "arguments[0].setAttribute('class', arguments[1])", element, class_value)


def _first_text(driver, css_selector):
    """Return the text of the first element matching *css_selector*.

    Raises IndexError when nothing matches.
    """
    return driver.find_elements_by_css_selector(css_selector)[0].text


def _scrape_song_detail(driver, song_id, year, month, rank):
    """Open the detail page of *song_id* in a new tab and return a filled Song."""
    song = Song()
    song.year = year
    song.month = month
    song.rank = rank

    driver.execute_script(
        "window.open('https://www.melon.com/song/detail.htm?songId=" + song_id + "');")
    driver.switch_to.window(driver.window_handles[1])

    song.title = _first_text(
        driver, '#downloadfrm > div > div > div.entry > div.info > div.song_name')
    song.singer = _first_text(
        driver, '#downloadfrm > div > div > div.entry > div.info > div.artist > a > span:nth-child(1)')
    song.album = _first_text(
        driver, '#downloadfrm > div > div > div.entry > div.meta > dl > dd:nth-child(2) > a')
    song.genre = _first_text(
        driver, '#downloadfrm > div > div > div.entry > div.meta > dl > dd:nth-child(6)')
    song.date = _first_text(
        driver, '#downloadfrm > div > div > div.entry > div.meta > dl > dd:nth-child(4)')
    song.likes = int(_first_text(driver, '#d_like_count').replace(',', ''))

    # Lyrics are optional: the attribute is only set when the element exists.
    lyrics = driver.find_elements_by_css_selector('#d_video_summary')
    if lyrics:
        song.lyrics = lyrics[0].text

    image_url = driver.find_elements_by_css_selector(
        '#downloadfrm > div > div > div.thumb > a > img')[0].get_attribute('src')
    song.setImage(_strip_query(image_url))

    # Close the detail tab and go back to the chart tab.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    return song


def _scrape_chart_rows(driver, row_id, first_rank, year, month, songs):
    """Scrape every chart row with id *row_id*, appending Songs to *songs*.

    Ranks are assigned sequentially starting at *first_rank*.
    """
    anchors = driver.find_elements_by_css_selector(
        '#' + row_id + ' > td:nth-child(4) > div > a')
    # Read all hrefs before opening any detail tab so the row elements do not
    # have to be looked up again after every window switch.
    hrefs = [anchor.get_attribute('href') for anchor in anchors]

    for offset, href in enumerate(hrefs):
        song_id = re.findall(r'\d+', href)[0]
        song = _scrape_song_detail(driver, song_id, year, month, first_rank + offset)
        songs.append(song)
        print(song.rank, song.title)


def GetMelonData():
    """Scrape Melon's monthly top-100 charts for YEAR / MONTH.

    Drives a Chrome browser through the chart finder on
    https://www.melon.com/chart/index.htm: selects the monthly chart, the
    decade, the year equal to ``YEAR``, each month listed in ``MONTH`` and the
    genre option, runs the search and visits the detail page of every listed
    song.

    Returns:
        list: one ``Song`` per chart entry, in the order scraped.

    Raises:
        Whatever Selenium raises when an expected element is missing, and
        IndexError when a detail page lacks one of the mandatory fields.

    NOTE(review): this source was recovered from a rendered diff that lost all
    indentation, so the block nesting below is reconstructed.  In particular
    the original ended with a ``break`` whose loop level could not be
    determined; this version stops as soon as the configured YEAR has been
    scraped -- confirm against the original file.
    """
    songs = []
    # The find_element(s)_by_* calls and the positional driver path are kept
    # as in the original so the code stays on the Selenium API it was written for.
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        # implicitly_wait() configures the element-lookup timeout for the whole
        # session; it does not pause, so setting it once is sufficient.
        driver.implicitly_wait(WAIT_TIME)
        driver.get('https://www.melon.com/chart/index.htm')

        # Click the chart finder button.
        driver.find_element_by_xpath(
            "//*[@id='gnb_menu']/ul[1]/li[1]/div/div/button").click()

        # Switch the chart-selection heading to class 'on'.
        _set_class(
            driver, driver.find_element_by_xpath("//*[@id='d_chart_search']/h3"), 'on')

        # Switch the monthly-chart tab to class 'on' and click it.
        monthly_tab = driver.find_element_by_class_name("tab02")
        _set_class(driver, monthly_tab, 'tab02 on')
        monthly_tab.click()

        # Activate the decade box and list its entries.
        _set_class(
            driver,
            driver.find_element_by_xpath("//*[@id='d_chart_search']/div/div/div[1]"),
            'box_chic nth1 view on')
        eras = driver.find_elements_by_xpath(
            "//*[@id='d_chart_search']/div/div/div[1]/div[1]/ul/li")
        ignored_eras = ['1990년대', '1980년대']

        for era in eras:
            if era.text in ignored_eras:
                continue
            _set_class(driver, era, 'on')
            era.click()

            # Activate the year box and list its entries.
            _set_class(
                driver,
                driver.find_element_by_xpath("//*[@id='d_chart_search']/div/div/div[2]"),
                'box_chic nth2 view on')
            years = driver.find_elements_by_xpath(
                "//*[@id='d_chart_search']/div/div/div[2]/div[1]/ul/li")

            for year in years:
                year_label = year.text
                print(year_label)
                if year_label != YEAR:
                    continue
                year_number = int(year_label[:4])

                _set_class(driver, year, 'on')
                year.click()

                # Activate the month box and list its entries.
                _set_class(
                    driver,
                    driver.find_element_by_xpath("//*[@id='d_chart_search']/div/div/div[3]"),
                    'box_chic nth3 view on')
                months = driver.find_elements_by_xpath(
                    "//*[@id='d_chart_search']/div/div/div[3]/div[1]/ul/li")

                for month in months:
                    month_label = month.text
                    print(month_label)
                    if month_label not in MONTH:
                        continue
                    month_number = int(month_label[:-1])

                    _set_class(driver, month, 'on')
                    month.click()

                    # Activate the genre box and pick the option for this period.
                    _set_class(
                        driver,
                        driver.find_element_by_xpath("//*[@id='d_chart_search']/div/div/div[5]"),
                        'box_chic last view on')
                    genre_option = driver.find_element_by_id(
                        _genre_element_id(year_number, month_number))
                    _set_class(driver, genre_option, 'on')
                    genre_option.click()

                    # Run the search.
                    driver.find_element_by_xpath(
                        "//*[@id='d_srch_form']/div[2]/button").click()

                    # Ranks 1-50 are in the 'lst50' rows, 51-100 in 'lst100'.
                    _scrape_chart_rows(
                        driver, 'lst50', 1, year_number, month_number, songs)
                    _scrape_chart_rows(
                        driver, 'lst100', 51, year_number, month_number, songs)

                # The configured year has been handled; nothing else to visit.
                return songs

        return songs
    finally:
        # Always shut the browser down, including when scraping fails midway.
        driver.quit()
163 +
# Scrape the configured months, then merge them into the accumulated results.
data = GetMelonData()

# NOTE: unpickling executes arbitrary code from the file; this assumes
# data.pickle is only ever produced by this script.
try:
    with open('data.pickle', 'rb') as f:
        before = pickle.load(f)
except FileNotFoundError:
    # First run: there is nothing to merge with yet.  Without this the freshly
    # scraped data would be lost to an exception raised after the scrape.
    before = []

# Serialise before opening the file for writing so that a pickling failure
# cannot leave data.pickle truncated.
payload = pickle.dumps(before + data)
with open('data.pickle', 'wb') as f:
    f.write(payload)
171 +