Showing 5 changed files with 553 additions and 0 deletions
End/Naver_crawl.py
0 → 100644
+from selenium import webdriver
+from selenium.common import exceptions
+from bs4 import BeautifulSoup
+import time
+import pymysql
+
+conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user='admin', password='41545737!', db='os_db', charset='utf8')
+curs = conn.cursor()
+
+def getData(url):
+    # Chrome options
+    options = webdriver.ChromeOptions()
+    #options.add_argument('headless')
+    #options.add_argument("disable-gpu")
+    _url = url  # URL to crawl
+    webDriver = "C:\\Users\\KimGun\\Desktop\\chromedriver_win32\\chromedriver.exe"  # local chromedriver path
+    driver = webdriver.Chrome(webDriver, chrome_options=options)
+    #driver = webdriver.Chrome(webDriver)
+    driver.get(_url)
+    pageCnt = 0
+    driver.implicitly_wait(3)  # wait for the page to finish loading
+    try:
+        while True:  # keep going until the last comment page
+            # find the 'more' button on the Naver News comment pane and keep clicking it until it is gone
+            time.sleep(0.5)
+            driver.find_element_by_css_selector(".u_cbox_btn_more").click()
+            pageCnt = pageCnt + 1
+
+    except (exceptions.ElementNotVisibleException, exceptions.NoSuchElementException):
+        pass  # reached the end of the comments
+
+    except Exception as e:  # report any other error
+        print(e)
+
+    pageSource = driver.page_source  # grab the fully expanded page source
+    result = BeautifulSoup(pageSource, "lxml")  # lxml parser, for speed
+    # pull the raw nickname, text and time nodes
+    comments_raw = result.find_all("span", {"class": "u_cbox_contents"})
+    nicknames_raw = result.find_all("span", {"class": "u_cbox_nick"})
+    times_raw = result.find_all("span", {"class": "u_cbox_date"})
+
+    # keep only the text of each node, as plain lists
+    comments = [comment.text for comment in comments_raw]
+    nicknames = [nickname.text for nickname in nicknames_raw]
+    times = [t.text for t in times_raw]  # 't', not 'time', so the time module is not shadowed
+
+    naverNewsList = []
+
+    for i in range(len(comments)):
+        info_dic = {'userID': nicknames[i], 'comment': comments[i], 'time': times[i]}
+        naverNewsList.append(info_dic)
+
+    print(naverNewsList)
+    driver.quit()  # release the browser before returning
+    return naverNewsList
+
+_url = input('Enter the URL to crawl: ')
+print('Fetching comment_list.....')
+cList = getData(_url)
+i = 194  # starting value for the Naver_ID primary key
+for row in cList:  # name, text, time
+    temp = row['comment'].replace("'", '')  # drop single quotes so they cannot break the SQL string
+    sql = "insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) values({},'{}','{}','{}','{}')".format(i, row['userID'], temp, row['time'], _url)
+    print(sql)
+    i = i + 1
+    curs.execute(sql)
+conn.commit()
+conn.close()
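Note: the loop above interpolates scraped text straight into the SQL string, which is why it strips single quotes by hand. A minimal sketch of the same insert using pymysql's parameterized queries (not part of this commit; cList, curs and conn are the objects defined above):

    # Sketch only: let pymysql quote each value, so comments containing
    # quotes cannot break the statement and replace("'", '') is unneeded.
    sql = ("insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) "
           "values (%s, %s, %s, %s, %s)")
    for i, row in enumerate(cList, start=194):
        curs.execute(sql, (i, row['userID'], row['comment'], row['time'], _url))
    conn.commit()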
End/Twitter_Input.ipynb
0 → 100644
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Default tweet collection period: 2019-12-08 to 2019-12-10 ===\n",
+      "=== Collecting data for 3 days in total ===\n",
+      "Enter a keyword to search for: 이동찬\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55c5a56d9ba7478f80d07518e22a3177",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(0,'r9OWHkiDE9EG3W9','I reached alamo for take my money back several times. They only keep showing me wrong information. Maybe ai works for it.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195691775451136')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(1,'r9OWHkiDE9EG3W9','Nope. I gave up 431 dollars already. I just want ppl not to lose their money in a pleasant place.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195233975504896')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(2,'optimum0524','IMF 환란이전 96년쯤 예금금리가 12%였던걸로 기억합니다. 80년대 코오롱그룹 이동찬 회장은 장영자에게 어음수표깡을 받으면서 50%이상의 금리를 적용받았다고하니 정말 요즘기준으로는 이해하기 힘든 시대였지요.','2019-12-10','https://twitter.com/optimum0524/status/1204190641866956801')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(3,'Naerum10','@이동찬','2019-12-09','https://twitter.com/Naerum10/status/1203920823725121537')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(4,'4bur0','이동찬 달려오는 폼이 너무 웃겨','2019-12-08','https://twitter.com/4bur0/status/1203736355584393216')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(5,'r9OWHkiDE9EG3W9','And i tried to contact to the headquarters. They only says “contact the branch”. So irresponsible and irritating reaction to customers.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517328811417600')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(6,'r9OWHkiDE9EG3W9','They told me deposit had been refunded at that time, but now i know they gave me a bullshit.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517327506993152')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(7,'r9OWHkiDE9EG3W9','If u use cash, there wouldn’t remain any record or deposit back at all.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517326278053888')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(8,'r9OWHkiDE9EG3W9','Ppl!! Warning!! At #1778 ala moana blvd, DO NOT use cash!!!','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517325023928320')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(9,'r9OWHkiDE9EG3W9','@Alamo hello. Alamo and ppl who rent a car at beautiful hawaii~','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517323283324928')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import GetOldTweets3 as got\n",
+    "from bs4 import BeautifulSoup\n",
+    "import pymysql\n",
+    "import datetime\n",
+    "import time\n",
+    "from random import uniform\n",
+    "from tqdm import tqdm_notebook\n",
+    "conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')\n",
+    "curs = conn.cursor()\n",
+    "\n",
+    "def get_tweets(criteria):\n",
+    "    tweet = got.manager.TweetManager.getTweets(criteria)\n",
+    "    tweet_list = []\n",
+    "\n",
+    "    for index in tqdm_notebook(tweet):\n",
+    "\n",
+    "        # tweet metadata\n",
+    "        username = index.username\n",
+    "        link = index.permalink\n",
+    "        content = index.text\n",
+    "        tweet_date = index.date.strftime(\"%Y-%m-%d\")\n",
+    "        retweets = index.retweets\n",
+    "        favorites = index.favorites\n",
+    "\n",
+    "        # combine the fields into one record\n",
+    "        info_list = {'username' : username, 'text': content, 'time': tweet_date, 'link': link}\n",
+    "        tweet_list.append(info_list)\n",
+    "        # brief pause between tweets\n",
+    "        time.sleep(uniform(1,2))\n",
+    "    return tweet_list\n",
+    "days_range = []\n",
+    "\n",
+    "start = datetime.datetime.strptime(\"2019-12-08\", \"%Y-%m-%d\")\n",
+    "end = datetime.datetime.strptime(\"2019-12-11\", \"%Y-%m-%d\")\n",
+    "date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]\n",
+    "\n",
+    "for date in date_generated:\n",
+    "    days_range.append(date.strftime(\"%Y-%m-%d\"))\n",
+    "print(\"=== Default tweet collection period: {} to {} ===\".format(days_range[0], days_range[-1]))\n",
+    "print(\"=== Collecting data for {} days in total ===\".format(len(days_range)))\n",
+    "\n",
+    "# align the collection window\n",
+    "start_date = days_range[0]\n",
+    "end_date = (datetime.datetime.strptime(days_range[-1], \"%Y-%m-%d\")\n",
+    "            + datetime.timedelta(days=1)).strftime(\"%Y-%m-%d\") # setUntil is exclusive, so add one day\n",
+    "\n",
+    "my_key = input(\"Enter a keyword to search for: \")\n",
+    "\n",
+    "tweetCriteria = got.manager.TweetCriteria().setQuerySearch(my_key)\\\n",
+    "                                           .setSince(\"2019-12-08\")\\\n",
+    "                                           .setUntil(\"2019-12-11\")\\\n",
+    "                                           .setMaxTweets(10)\n",
+    "result_list = get_tweets(tweetCriteria)\n",
+    "\n",
+    "i = 0\n",
+    "for row in result_list : # name, text, date, link\n",
+    "    sql = \"insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values({},'{}','{}','{}','{}')\".format(i,row['username'],row['text'],row['time'],row['link'])\n",
+    "    print(sql)\n",
+    "    i = i + 1\n",
+    "    curs.execute(sql)\n",
+    "conn.commit()\n",
+    "conn.close()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
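Note: the notebook's insert loop has the same quoting hazard as the Naver script; a tweet containing a single quote would break the generated SQL. A hedged sketch of a batched alternative using pymysql's executemany over the same result_list (not part of the commit):

    # Sketch only: one batched call, with every value parameterized.
    rows = [(i, r['username'], r['text'], r['time'], r['link'])
            for i, r in enumerate(result_list)]
    curs.executemany(
        "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) "
        "values (%s, %s, %s, %s, %s)",
        rows)
    conn.commit()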
End/downloader.py
0 → 100644
+#!/usr/bin/env python
+from __future__ import print_function
+import sys
+import os
+import time
+import json
+import requests
+import argparse
+import lxml.html
+import io
+from urllib.parse import urlparse, parse_qs
+from lxml.cssselect import CSSSelector
+
+YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
+YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
+
+def find_value(html, key, num_chars=2):
+    # return the quoted value that follows `key` in the raw html
+    pos_begin = html.find(key) + len(key) + num_chars
+    pos_end = html.find('"', pos_begin)
+    return html[pos_begin: pos_end]
+
+def extract_comments(html):
+    tree = lxml.html.fromstring(html)
+    item_sel = CSSSelector('.comment-item')
+    text_sel = CSSSelector('.comment-text-content')
+    time_sel = CSSSelector('.time')
+    author_sel = CSSSelector('.user-name')
+    for item in item_sel(tree):
+        yield {'cid': item.get('data-cid'),
+               'text': text_sel(item)[0].text_content(),
+               'time': time_sel(item)[0].text_content().strip(),
+               'author': author_sel(item)[0].text_content()}
+
+def extract_reply_cids(html):
+    tree = lxml.html.fromstring(html)
+    sel = CSSSelector('.comment-replies-header > .load-comments')
+    return [i.get('data-cid') for i in sel(tree)]
+
+def ajax_request(session, url, params, data, retries=10, sleep=20):
+    # returns None implicitly if every retry fails
+    for _ in range(retries):
+        response = session.post(url, params=params, data=data)
+        if response.status_code == 200:
+            response_dict = json.loads(response.text)
+            return response_dict.get('page_token', None), response_dict['html_content']
+        else:
+            time.sleep(sleep)
+
+def download_comments(youtube_id, sleep=1):
+    session = requests.Session()
+    session.headers['User-Agent'] = USER_AGENT
+    # Get Youtube page with initial comments
+    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
+    html = response.text
+    reply_cids = extract_reply_cids(html)
+    ret_cids = []
+    for comment in extract_comments(html):
+        ret_cids.append(comment['cid'])
+        yield comment
+    page_token = find_value(html, 'data-token')
+    session_token = find_value(html, 'XSRF_TOKEN', 4)
+    first_iteration = True
+    # Get remaining comments (the same as pressing the 'Show more' button)
+    while page_token:
+        data = {'video_id': youtube_id,
+                'session_token': session_token}
+        params = {'action_load_comments': 1,
+                  'order_by_time': True,
+                  'filter': youtube_id}
+        if first_iteration:
+            params['order_menu'] = True
+        else:
+            data['page_token'] = page_token
+        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+        if not response:
+            break
+        page_token, html = response
+        reply_cids += extract_reply_cids(html)
+        for comment in extract_comments(html):
+            if comment['cid'] not in ret_cids:
+                ret_cids.append(comment['cid'])
+                yield comment
+        first_iteration = False
+        time.sleep(sleep)
+    # Get replies (the same as pressing the 'View all X replies' link)
+    for cid in reply_cids:
+        data = {'comment_id': cid,
+                'video_id': youtube_id,
+                'can_reply': 1,
+                'session_token': session_token}
+        params = {'action_load_replies': 1,
+                  'order_by_time': True,
+                  'filter': youtube_id,
+                  'tab': 'inbox'}
+        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+        if not response:
+            break
+        _, html = response
+        for comment in extract_comments(html):
+            if comment['cid'] not in ret_cids:
+                ret_cids.append(comment['cid'])
+                yield comment
+        time.sleep(sleep)
+
+## parse the video id out of the input value (full URL or bare id)
+def video_id(value):
+    query = urlparse(value)
+    if query.hostname == 'youtu.be':
+        return query.path[1:]
+    if query.hostname in ('www.youtube.com', 'youtube.com'):
+        if query.path == '/watch':
+            p = parse_qs(query.query)
+            return p['v'][0]
+        if query.path[:7] == '/embed/':
+            return query.path.split('/')[2]
+        if query.path[:3] == '/v/':
+            return query.path.split('/')[2]
+    # not a recognised YouTube URL
+    return None
+
+def main():
+    #parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
+    #parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
+    #parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
+    #parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
+    #parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')
+    Youtube_id1 = input('Enter a Youtube ID or URL: ')
+    ## keep the raw link, but extract just the id for the request
+    Youtube_id2 = Youtube_id1
+    Youtube_id1 = video_id(Youtube_id1)
+    youtube_id = Youtube_id1
+    try:
+        # args = parser.parse_args(argv)
+        #youtube_id = args.youtubeid
+        #output = args.output
+        #limit = args.limit
+        result_List = []
+        ## values are read with input() rather than command-line arguments
+        ## an empty Limit falls back to a default of 100
+        if not youtube_id:
+            raise ValueError('please enter a valid Youtube ID or URL')
+        print('Downloading Youtube comments for video:', youtube_id)
+        Number = 1  # hard-coded to 1, so the file-output branch below never runs
+        if Number == 0:
+            Output1 = input('Enter the output filename: ')
+            Limit1 = input('Enter the comment limit: ')
+            if Limit1 == '':
+                Limit1 = 100
+            Limit1 = int(Limit1)
+            limit = int(Limit1)
+            output = Output1
+            count = 0  # was missing: count is incremented below
+            with io.open(output, 'w', encoding='utf8') as fp:
+                for comment in download_comments(youtube_id):
+                    comment_json = json.dumps(comment, ensure_ascii=False)
+                    print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
+                    count += 1
+                    sys.stdout.flush()
+                    if limit and count >= limit:
+                        print('Downloaded {} comment(s)\r'.format(count))
+                        print('\nDone!')
+                        break
+        else:
+            count = 0
+            i = 0
+            limit = 100
+            for comment in download_comments(youtube_id):
+                dic = {}
+                dic['cid'] = comment['cid']
+                dic['text'] = str(comment['text'])
+                dic['time'] = comment['time']
+                dic['author'] = comment['author']
+                dic['link'] = Youtube_id2
+                result_List.append(dic)
+                count += 1
+                i += 1
+                if limit == count:
+                    print(' Comment thread created')
+                    print('\n\n\n\n\n\n\n')
+                    break
+            return result_List
+            #goto_Menu(result_List)
+    except Exception as e:
+        print('Error:', str(e))
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
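Note: downloader.py scrapes the pre-Polymer all_comments and comment_ajax endpoints, which YouTube may no longer serve. The video_id() helper itself is self-contained; a quick check of the URL shapes it accepts (the ID below is illustrative, not a real video):

    assert video_id('https://youtu.be/abc123XYZ89') == 'abc123XYZ89'
    assert video_id('https://www.youtube.com/watch?v=abc123XYZ89') == 'abc123XYZ89'
    assert video_id('https://www.youtube.com/embed/abc123XYZ89') == 'abc123XYZ89'
    assert video_id('https://example.com/watch?v=abc123XYZ89') is None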
End/schema.sql
0 → 100644
+-- MySQL Workbench Forward Engineering
+
+SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
+SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
+SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';
+
+-- -----------------------------------------------------
+-- Schema os_db
+-- -----------------------------------------------------
+CREATE SCHEMA IF NOT EXISTS `os_db` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+USE `os_db`;
+
+-- -----------------------------------------------------
+-- Table `os_db`.`naver`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`naver` (
+  `Naver_ID` INT(11) NOT NULL,
+  `Naver_Name` VARCHAR(6000) NULL DEFAULT NULL,
+  `Naver_Text` VARCHAR(6000) NULL DEFAULT NULL,
+  `Naver_Date` VARCHAR(100) NULL DEFAULT NULL,
+  `Naver_Link` VARCHAR(1000) NULL DEFAULT NULL,
+  PRIMARY KEY (`Naver_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`twitter`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`twitter` (
+  `Twitter_ID` INT(11) NOT NULL,
+  `Twitter_Name` VARCHAR(6000) NULL DEFAULT NULL,
+  `Twitter_Link` VARCHAR(6000) NULL DEFAULT NULL,
+  `Twitter_Date` VARCHAR(100) NULL DEFAULT NULL,
+  `Twitter_Text` VARCHAR(6000) NULL DEFAULT NULL,
+  PRIMARY KEY (`Twitter_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`user`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`user` (
+  `User_ID` INT(11) NOT NULL,
+  PRIMARY KEY (`User_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8mb4
+COLLATE = utf8mb4_unicode_ci;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`user_naver`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`user_naver` (
+  `User_ID` INT(11) NOT NULL,
+  `Naver_ID` INT(11) NOT NULL,
+  PRIMARY KEY (`User_ID`, `Naver_ID`),
+  INDEX `UN_Naver_idx` (`Naver_ID` ASC),
+  CONSTRAINT `UN_Naver`
+    FOREIGN KEY (`Naver_ID`)
+    REFERENCES `os_db`.`naver` (`Naver_ID`),
+  CONSTRAINT `UN_user`
+    FOREIGN KEY (`User_ID`)
+    REFERENCES `os_db`.`user` (`User_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8mb4
+COLLATE = utf8mb4_unicode_ci;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`user_twitter`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`user_twitter` (
+  `User_ID` INT(11) NOT NULL,
+  `Twitter_ID` INT(11) NOT NULL,
+  PRIMARY KEY (`User_ID`, `Twitter_ID`),
+  INDEX `UT_twitter_idx` (`Twitter_ID` ASC),
+  CONSTRAINT `UT_twitter`
+    FOREIGN KEY (`Twitter_ID`)
+    REFERENCES `os_db`.`twitter` (`Twitter_ID`),
+  CONSTRAINT `UT_user`
+    FOREIGN KEY (`User_ID`)
+    REFERENCES `os_db`.`user` (`User_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8mb4
+COLLATE = utf8mb4_unicode_ci;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`youtube`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`youtube` (
+  `Youtube_ID` INT(11) NOT NULL,
+  `Youtube_Text` VARCHAR(12000) NULL DEFAULT NULL,
+  `Youtube_Name` VARCHAR(6000) NULL DEFAULT NULL,
+  `Youtube_Date` VARCHAR(100) NULL DEFAULT NULL,
+  `Youtube_Link` VARCHAR(1000) NULL DEFAULT NULL,
+  PRIMARY KEY (`Youtube_ID`))
+ENGINE = MyISAM
+DEFAULT CHARACTER SET = utf8;
+
+
+-- -----------------------------------------------------
+-- Table `os_db`.`user_youtube`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `os_db`.`user_youtube` (
+  `User_ID` INT(11) NOT NULL,
+  `Youtube_ID` INT(11) NOT NULL,
+  PRIMARY KEY (`User_ID`, `Youtube_ID`),
+  INDEX `UY_youtube_idx` (`Youtube_ID` ASC),
+  CONSTRAINT `UY_user`
+    FOREIGN KEY (`User_ID`)
+    REFERENCES `os_db`.`user` (`User_ID`),
+  CONSTRAINT `UY_youtube`
+    FOREIGN KEY (`Youtube_ID`)
+    REFERENCES `os_db`.`youtube` (`Youtube_ID`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = utf8mb4
+COLLATE = utf8mb4_unicode_ci;
+
+
+SET SQL_MODE=@OLD_SQL_MODE;
+SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
+SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
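Note: user_naver, user_twitter and user_youtube are many-to-many link tables, so both referenced rows must exist before a link row can be inserted. A hypothetical usage sketch (host and password are placeholders for the values used in the crawler scripts; the user id 1 and Naver_ID 194, the first id Naver_crawl.py writes, are illustrative):

    import pymysql

    conn = pymysql.connect(host='<rds-endpoint>', user='admin',
                           password='<password>', db='os_db', charset='utf8')
    curs = conn.cursor()
    # parent row first, then the link row, to satisfy the foreign keys
    curs.execute("insert into user (User_ID) values (%s)", (1,))
    curs.execute("insert into user_naver (User_ID, Naver_ID) values (%s, %s)", (1, 194))
    conn.commit()
    conn.close()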
End/youtube_crawl2.py
0 → 100644
+import downloader
+import pymysql
+import csv
+import random
+
+conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user='admin', password='41545737!', db='os_db', charset='utf8')
+curs = conn.cursor()
+
+def call_main():
+    print(' Creating comment thread \n')
+    print(' **************************************************************')
+    print(' **************************************************************')
+    print(' **************************************************************')
+    print(' ************  Enter the information requested below.  ************')
+    print(' **************************************************************')
+    print(' **************************************************************')
+    print(' **************************************************************')
+    a = downloader.main()
+    return a
+
+CommentList = call_main()  ## list of dicts: cid, text, time, author (plus link)
+i = 0
+for row in CommentList:
+    temp = row['text'].replace("'", '')  # drop single quotes so they cannot break the SQL string
+    sql = "insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) values({},'{}','{}','{}','{}')".format(i, temp, row['time'], row['author'], row['link'])
+    print(sql)
+    i = i + 1
+    curs.execute(sql)
+conn.commit()
+conn.close()
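Note: every run of this script numbers Youtube_ID from 0, so a second run collides with the primary key (presumably the same problem the Naver script sidesteps by hard-coding i = 194). A sketch of the insert loop that instead continues from the table's current maximum, with parameterized values (an assumption, not part of the commit):

    # Sketch only: resume numbering after whatever is already stored.
    curs.execute("select coalesce(max(Youtube_ID), -1) from youtube")
    start = curs.fetchone()[0] + 1
    sql = ("insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) "
           "values (%s, %s, %s, %s, %s)")
    for i, row in enumerate(CommentList, start=start):
        curs.execute(sql, (i, row['text'], row['time'], row['author'], row['link']))
    conn.commit()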