Showing
5 changed files
with
553 additions
and
0 deletions
End/Naver_crawl.py
0 → 100644
| 1 | +from selenium import webdriver | ||
| 2 | +from selenium.common import exceptions | ||
| 3 | +from bs4 import BeautifulSoup | ||
| 4 | +import time | ||
| 5 | +import pymysql | ||
| 6 | + | ||
import os

# SECURITY(review): the connection settings below, including the password,
# are committed in plain text. They are kept only as fallbacks so existing
# runs keep working. Set the OS_DB_* environment variables, rotate the
# password, and then delete the literals from the source.
conn = pymysql.connect(
    host=os.environ.get('OS_DB_HOST', 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com'),
    user=os.environ.get('OS_DB_USER', 'admin'),
    password=os.environ.get('OS_DB_PASSWORD', '41545737!'),
    db=os.environ.get('OS_DB_NAME', 'os_db'),
    charset='utf8',
)
curs = conn.cursor()
def getData(url):
    """Collect the comments shown on a Naver News comment page.

    Opens ``url`` in Chrome through Selenium, clicks the "more comments"
    button (``.u_cbox_btn_more``) until it can no longer be clicked, then
    parses the rendered page with BeautifulSoup.

    Args:
        url: Address of the page to crawl.

    Returns:
        A list of dicts with the keys ``'userID'``, ``'comment'`` and
        ``'time'``, one per comment, in page order.
    """
    options = webdriver.ChromeOptions()
    # Location of the local chromedriver binary (machine specific).
    webDriver = "C:\\Users\\KimGun\\Desktop\\chromedriver_win32\\chromedriver.exe"
    driver = webdriver.Chrome(webDriver, chrome_options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(3)  # let element lookups wait up to 3 seconds
        try:
            # Keep pressing "more" until the button stops being clickable.
            while True:
                time.sleep(0.5)
                driver.find_element_by_css_selector(".u_cbox_btn_more").click()
        except exceptions.ElementNotVisibleException:
            # The button is hidden once the last page of comments is loaded.
            pass
        except Exception as e:
            # Any other failure is reported; whatever was loaded is still parsed.
            print(e)
        pageSource = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    result = BeautifulSoup(pageSource, "lxml")
    comments = [tag.text for tag in result.find_all("span", {"class": "u_cbox_contents"})]
    nicknames = [tag.text for tag in result.find_all("span", {"class": "u_cbox_nick"})]
    times = [tag.text for tag in result.find_all("span", {"class": "u_cbox_date"})]

    # zip stops at the shortest list, so a length mismatch no longer raises
    # IndexError.
    # NOTE(review): the three lists are paired purely by position; if the page
    # can contain a nickname without a comment body the rows would be
    # misaligned - confirm against the live markup.
    naverNewsList = [
        {'userID': nickname, 'comment': comment, 'time': written_at}
        for nickname, comment, written_at in zip(nicknames, comments, times)
    ]

    print(naverNewsList)
    return naverNewsList
| 58 | + | ||
_url = input('검색하고자 하는 url을 입력해주세요: ')
print('comment_list를 가져오는 중.....')
cList = getData(_url)

# Placeholders let the driver escape every value. This closes the SQL
# injection hole of the string-formatted statement and makes stripping
# quotes from the comment text unnecessary, so the text is stored unchanged.
sql = ("insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) "
       "values(%s,%s,%s,%s,%s)")
try:
    # Continue after the highest stored ID so a re-run does not hit a
    # duplicate primary key; an empty table still starts at 194 as before.
    curs.execute("select coalesce(max(Naver_ID), 193) + 1 from naver")
    first_id = int(curs.fetchone()[0])
    for i, row in enumerate(cList, start=first_id):  # name, text, time
        params = (i, row['userID'], row['comment'], row['time'], _url)
        print(sql, params)
        curs.execute(sql, params)
    conn.commit()
finally:
    # Close the connection even when an insert fails.
    conn.close()
End/Twitter_Input.ipynb
0 → 100644
| 1 | +{ | ||
| 2 | + "cells": [ | ||
| 3 | + { | ||
| 4 | + "cell_type": "code", | ||
| 5 | + "execution_count": 6, | ||
| 6 | + "metadata": {}, | ||
| 7 | + "outputs": [ | ||
| 8 | + { | ||
| 9 | + "name": "stdout", | ||
| 10 | + "output_type": "stream", | ||
| 11 | + "text": [ | ||
| 12 | + "=== 기본으로 설정된 트윗 수집 기간은 2019-12-08 에서 2019-12-10 까지 입니다 ===\n", | ||
| 13 | + "=== 총 3일 간의 데이터 수집 ===\n", | ||
| 14 | + "검색할 키워드를 입력해주세요: 이동찬\n" | ||
| 15 | + ] | ||
| 16 | + }, | ||
| 17 | + { | ||
| 18 | + "data": { | ||
| 19 | + "application/vnd.jupyter.widget-view+json": { | ||
| 20 | + "model_id": "55c5a56d9ba7478f80d07518e22a3177", | ||
| 21 | + "version_major": 2, | ||
| 22 | + "version_minor": 0 | ||
| 23 | + }, | ||
| 24 | + "text/plain": [ | ||
| 25 | + "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))" | ||
| 26 | + ] | ||
| 27 | + }, | ||
| 28 | + "metadata": {}, | ||
| 29 | + "output_type": "display_data" | ||
| 30 | + }, | ||
| 31 | + { | ||
| 32 | + "name": "stdout", | ||
| 33 | + "output_type": "stream", | ||
| 34 | + "text": [ | ||
| 35 | + "\n", | ||
| 36 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(0,'r9OWHkiDE9EG3W9','I reached alamo for take my money back several times. They only keep showing me wrong information. Maybe ai works for it.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195691775451136')\n", | ||
| 37 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(1,'r9OWHkiDE9EG3W9','Nope. I gave up 431 dollars already. I just want ppl not to lose their money in a pleasant place.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195233975504896')\n", | ||
| 38 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(2,'optimum0524','IMF 환란이전 96년쯤 예금금리가 12%였던걸로 기억합니다. 80년대 코오롱그룹 이동찬 회장은 장영자에게 어음수표깡을 받으면서 50%이상의 금리를 적용받았다고하니 정말 요즘기준으로는 이해하기 힘든 시대였지요.','2019-12-10','https://twitter.com/optimum0524/status/1204190641866956801')\n", | ||
| 39 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(3,'Naerum10','@이동찬','2019-12-09','https://twitter.com/Naerum10/status/1203920823725121537')\n", | ||
| 40 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(4,'4bur0','이동찬 달려오는 폼이 너무 웃겨','2019-12-08','https://twitter.com/4bur0/status/1203736355584393216')\n", | ||
| 41 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(5,'r9OWHkiDE9EG3W9','And i tried to contact to the headquarters. They only says “contact the branch”. So irresponsible and irritating reaction to customers.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517328811417600')\n", | ||
| 42 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(6,'r9OWHkiDE9EG3W9','They told me deposit had been refunded at that time, but now i know they gave me a bullshit.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517327506993152')\n", | ||
| 43 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(7,'r9OWHkiDE9EG3W9','If u use cash, there wouldn’t remain any record or deposit back at all.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517326278053888')\n", | ||
| 44 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(8,'r9OWHkiDE9EG3W9','Ppl!! Warning!! At #1778 ala moana blvd, DO NOT use cash!!!','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517325023928320')\n", | ||
| 45 | + "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(9,'r9OWHkiDE9EG3W9','@Alamo hello. Alamo and ppl who rent a car at beautiful hawaii~','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517323283324928')\n" | ||
| 46 | + ] | ||
| 47 | + } | ||
| 48 | + ], | ||
| 49 | + "source": [ | ||
| 50 | + "import GetOldTweets3 as got\n", | ||
| 51 | + "from bs4 import BeautifulSoup\n", | ||
| 52 | + "import pymysql\n", | ||
| 53 | + "import datetime\n", | ||
| 54 | + "import time\n", | ||
| 55 | + "from random import uniform\n", | ||
| 56 | + "from tqdm import tqdm_notebook\n", | ||
| 57 | + "conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')\n", | ||
| 58 | + "curs = conn.cursor()\n", | ||
| 59 | + "\n", | ||
| 60 | + "def get_tweets(criteria):\n", | ||
| 61 | + " tweet = got.manager.TweetManager.getTweets(criteria)\n", | ||
| 62 | + " tweet_list = []\n", | ||
| 63 | + "\n", | ||
| 64 | + " for index in tqdm_notebook(tweet):\n", | ||
| 65 | + "\n", | ||
| 66 | + " # 메타데이터 목록\n", | ||
| 67 | + " username = index.username\n", | ||
| 68 | + " link = index.permalink\n", | ||
| 69 | + " content = index.text\n", | ||
| 70 | + " tweet_date = index.date.strftime(\"%Y-%m-%d\")\n", | ||
| 71 | + " retweets = index.retweets\n", | ||
| 72 | + " favorites = index.favorites\n", | ||
| 73 | + "\n", | ||
| 74 | + " # 결과 합치기\n", | ||
| 75 | + " info_list = {'username' : username, 'text': content, 'time': tweet_date, 'link': link}\n", | ||
| 76 | + " tweet_list.append(info_list)\n", | ||
| 77 | + " # 휴식\n", | ||
| 78 | + " time.sleep(uniform(1,2))\n", | ||
| 79 | + " return tweet_list\n", | ||
| 80 | + "days_range = []\n", | ||
| 81 | + "\n", | ||
| 82 | + "start = datetime.datetime.strptime(\"2019-12-08\", \"%Y-%m-%d\")\n", | ||
| 83 | + "end = datetime.datetime.strptime(\"2019-12-11\", \"%Y-%m-%d\")\n", | ||
| 84 | + "date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]\n", | ||
| 85 | + "\n", | ||
| 86 | + "for date in date_generated:\n", | ||
| 87 | + " days_range.append(date.strftime(\"%Y-%m-%d\"))\n", | ||
| 88 | + "print(\"=== 기본으로 설정된 트윗 수집 기간은 {} 에서 {} 까지 입니다 ===\".format(days_range[0], days_range[-1]))\n", | ||
| 89 | + "print(\"=== 총 {}일 간의 데이터 수집 ===\".format(len(days_range)))\n", | ||
| 90 | + "\n", | ||
| 91 | + "# 수집 기간 맞추기\n", | ||
| 92 | + "start_date = days_range[0]\n", | ||
| 93 | + "end_date = (datetime.datetime.strptime(days_range[-1], \"%Y-%m-%d\")\n", | ||
| 94 | + " + datetime.timedelta(days=1)).strftime(\"%Y-%m-%d\") # setUntil이 끝을 포함하지 않으므로, day + 1\n", | ||
| 95 | + "\n", | ||
| 96 | + "my_key = input(\"검색할 키워드를 입력해주세요: \")\n", | ||
| 97 | + "\n", | ||
| 98 | + "tweetCriteria = got.manager.TweetCriteria().setQuerySearch(my_key)\\\n", | ||
| 99 | + " .setSince(\"2019-12-08\")\\\n", | ||
| 100 | + " .setUntil(\"2019-12-11\")\\\n", | ||
| 101 | + " .setMaxTweets(10)\n", | ||
| 102 | + "result_list = get_tweets(tweetCriteria)\n", | ||
| 103 | + "\n", | ||
| 104 | + "i = 0\n", | ||
| 105 | + "for row in result_list : # 이름 내용 날짜 링크\n", | ||
| 106 | + " sql = \"insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values({},'{}','{}','{}','{}')\".format(i,row['username'],row['text'],row['time'],row['link'])\n", | ||
| 107 | + " print(sql)\n", | ||
| 108 | + " i = i + 1\n", | ||
| 109 | + " curs.execute(sql)\n", | ||
| 110 | + "conn.commit()\n", | ||
| 111 | + "conn.close()\n" | ||
| 112 | + ] | ||
| 113 | + }, | ||
| 114 | + { | ||
| 115 | + "cell_type": "code", | ||
| 116 | + "execution_count": null, | ||
| 117 | + "metadata": {}, | ||
| 118 | + "outputs": [], | ||
| 119 | + "source": [] | ||
| 120 | + } | ||
| 121 | + ], | ||
| 122 | + "metadata": { | ||
| 123 | + "kernelspec": { | ||
| 124 | + "display_name": "Python 3", | ||
| 125 | + "language": "python", | ||
| 126 | + "name": "python3" | ||
| 127 | + }, | ||
| 128 | + "language_info": { | ||
| 129 | + "codemirror_mode": { | ||
| 130 | + "name": "ipython", | ||
| 131 | + "version": 3 | ||
| 132 | + }, | ||
| 133 | + "file_extension": ".py", | ||
| 134 | + "mimetype": "text/x-python", | ||
| 135 | + "name": "python", | ||
| 136 | + "nbconvert_exporter": "python", | ||
| 137 | + "pygments_lexer": "ipython3", | ||
| 138 | + "version": "3.7.3" | ||
| 139 | + } | ||
| 140 | + }, | ||
| 141 | + "nbformat": 4, | ||
| 142 | + "nbformat_minor": 2 | ||
| 143 | +} |
End/downloader.py
0 → 100644
| 1 | +#!/usr/bin/env python | ||
| 2 | +from __future__ import print_function | ||
| 3 | +import sys | ||
| 4 | +import os | ||
| 5 | +import time | ||
| 6 | +import json | ||
| 7 | +import requests | ||
| 8 | +import argparse | ||
| 9 | +import lxml.html | ||
| 10 | +import io | ||
| 11 | +from urllib.parse import urlparse, parse_qs | ||
| 12 | +from lxml.cssselect import CSSSelector | ||
# Page requested first for a video; {youtube_id} is filled in with str.format.
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# Endpoint that download_comments POSTs to for further comment pages and replies.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
# User-Agent header set on the download session.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    """Return the text that follows ``key`` in ``html`` up to the next ``"``.

    Args:
        html: Text to search.
        key: Marker that precedes the value.
        num_chars: Number of characters to skip between the end of ``key``
            and the start of the value (for example ``="``).

    Returns:
        The value as a string, or ``''`` when ``key`` or the closing double
        quote is not present. Callers test the result for truthiness.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # Without this guard the -1 from find() would produce a garbage slice.
        return ''
    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end == -1:
        return ''
    return html[pos_begin:pos_end]
def extract_comments(html):
    """Yield one dict per ``.comment-item`` element found in ``html``.

    Each dict has the keys ``'cid'`` (the element's ``data-cid`` attribute),
    ``'text'``, ``'time'`` (whitespace-stripped) and ``'author'``.
    """
    document = lxml.html.fromstring(html)
    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    for node in select_items(document):
        cid = node.get('data-cid')
        body = select_text(node)[0].text_content()
        posted = select_time(node)[0].text_content().strip()
        writer = select_author(node)[0].text_content()
        yield {'cid': cid, 'text': body, 'time': posted, 'author': writer}
def extract_reply_cids(html):
    """Return the ``data-cid`` of every "load replies" element in ``html``.

    The elements are those matching
    ``.comment-replies-header > .load-comments``.
    """
    document = lxml.html.fromstring(html)
    select_links = CSSSelector('.comment-replies-header > .load-comments')
    cids = []
    for link in select_links(document):
        cids.append(link.get('data-cid'))
    return cids
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to ``url`` until a 200 response arrives or the retries run out.

    Args:
        session: Object with a ``post(url, params=..., data=...)`` method.
        url: Address to post to.
        params: Query-string parameters for the request.
        data: Form fields for the request.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two attempts.

    Returns:
        ``(page_token, html_content)`` taken from the JSON body of the first
        200 response (``page_token`` is ``None`` when the body has none), or
        ``None`` when no attempt returned status 200.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        # Pause only when another attempt follows; waiting after the last
        # failure would just delay the caller.
        if attempt < retries - 1:
            time.sleep(sleep)
    return None
def download_comments(youtube_id, sleep=1):
    """Yield the comments of a YouTube video, then the replies to them.

    The generator works in three phases: the comments embedded in the first
    page, further pages fetched through the AJAX endpoint while a page token
    is available, and finally the replies of every comment that advertises
    them. A comment ID seen before is not yielded again in the second and
    third phase.

    Args:
        youtube_id: ID of the video.
        sleep: Seconds to pause after each AJAX request.

    Yields:
        Dicts as produced by ``extract_comments``.
    """
    # The context manager closes the HTTP session when the generator finishes
    # or is closed/garbage-collected after the consumer stops early.
    with requests.Session() as session:
        session.headers['User-Agent'] = USER_AGENT

        # Get Youtube page with initial comments
        response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
        html = response.text
        reply_cids = extract_reply_cids(html)

        # A set keeps the duplicate check O(1) per comment.
        seen_cids = set()
        for comment in extract_comments(html):
            seen_cids.add(comment['cid'])
            yield comment

        page_token = find_value(html, 'data-token')
        session_token = find_value(html, 'XSRF_TOKEN', 4)

        first_iteration = True

        # Get remaining comments (the same as pressing the 'Show more' button)
        while page_token:
            data = {'video_id': youtube_id,
                    'session_token': session_token}
            params = {'action_load_comments': 1,
                      'order_by_time': True,
                      'filter': youtube_id}

            # The first request selects the ordering; later ones carry the
            # token of the page to load.
            if first_iteration:
                params['order_menu'] = True
            else:
                data['page_token'] = page_token

            response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
            if not response:
                break

            page_token, html = response

            reply_cids += extract_reply_cids(html)
            for comment in extract_comments(html):
                if comment['cid'] not in seen_cids:
                    seen_cids.add(comment['cid'])
                    yield comment

            first_iteration = False
            time.sleep(sleep)

        # Get replies (the same as pressing the 'View all X replies' link)
        for cid in reply_cids:
            data = {'comment_id': cid,
                    'video_id': youtube_id,
                    'can_reply': 1,
                    'session_token': session_token}
            params = {'action_load_replies': 1,
                      'order_by_time': True,
                      'filter': youtube_id,
                      'tab': 'inbox'}

            response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
            if not response:
                break

            _, html = response

            for comment in extract_comments(html):
                if comment['cid'] not in seen_cids:
                    seen_cids.add(comment['cid'])
                    yield comment
            time.sleep(sleep)
def video_id(value):
    """Extract the video ID from a YouTube URL typed by the user.

    Recognised forms are ``youtu.be/<id>`` and, on ``youtube.com``,
    ``www.youtube.com`` or ``m.youtube.com``: ``/watch?v=<id>``,
    ``/embed/<id>`` and ``/v/<id>``.

    Args:
        value: The URL to parse.

    Returns:
        The video ID, or ``None`` when ``value`` is not one of the recognised
        forms (including a ``/watch`` URL without a ``v`` parameter).
    """
    query = urlparse(value)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        if query.path == '/watch':
            # .get avoids a KeyError for a /watch URL that has no "v" parameter.
            values = parse_qs(query.query).get('v')
            return values[0] if values else None
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    # Nothing matched.
    return None
def main():
    """Prompt for a YouTube link and return up to 100 of its comments.

    The text typed by the user is reduced to a video ID with ``video_id``;
    the comments are then read from ``download_comments``.

    Returns:
        A list of dicts with the keys ``'cid'``, ``'text'``, ``'time'``,
        ``'author'`` and ``'link'`` (``'link'`` is the text the user typed).

    Any exception raised while collecting, including an input from which no
    video ID can be extracted, is printed as ``Error: ...`` and the process
    exits with status 1.
    """
    limit = 100  # maximum number of comments collected per run

    link = input('Youtube_ID 입력 :')
    # The user enters a link; only the video ID is used for downloading.
    youtube_id = video_id(link)
    try:
        if not youtube_id:
            raise ValueError('올바른 입력 값을 입력하세요')
        print('Downloading Youtube comments for video:', youtube_id)

        result_List = []
        for comment in download_comments(youtube_id):
            result_List.append({
                'cid': comment['cid'],
                'text': str(comment['text']),
                'time': comment['time'],
                'author': comment['author'],
                'link': link,
            })
            if len(result_List) == limit:
                print(' Comment Thread 생성 완료')
                print('\n\n\n\n\n\n\n')
                break
        return result_List
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)


if __name__ == "__main__":
    main()
End/schema.sql
0 → 100644
| 1 | +-- MySQL Workbench Forward Engineering | ||
| 2 | + | ||
-- Remember the session's current settings in user variables and relax the
-- unique-key and foreign-key checks while the schema is built; the SET
-- statements at the end of this script restore the saved values.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';

-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
-- Created only when missing. Its default character set is utf8mb4; tables
-- that declare their own character set below override this default.
CREATE SCHEMA IF NOT EXISTS `os_db` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci ;
USE `os_db` ;
| 16 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`naver`
-- -----------------------------------------------------
-- One row per scraped Naver News comment.
--   Naver_ID   integer key supplied by the inserting script (no AUTO_INCREMENT)
--   Naver_Name nickname of the comment's author
--   Naver_Text comment text
--   Naver_Date timestamp kept as text, not as a DATE/DATETIME value
--   Naver_Link URL of the page the comment was scraped from
-- NOTE(review): this table declares utf8 while the schema default is utf8mb4;
-- utf8 is presumably MySQL's 3-byte encoding, which cannot hold 4-byte
-- characters such as emoji - confirm before relying on it for comment text.
CREATE TABLE IF NOT EXISTS `os_db`.`naver` (
  `Naver_ID` INT(11) NOT NULL,
  `Naver_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Naver_Text` VARCHAR(6000) NULL DEFAULT NULL,
  `Naver_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Naver_Link` VARCHAR(1000) NULL DEFAULT NULL,
  PRIMARY KEY (`Naver_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
| 29 | + | ||
| 30 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`twitter`
-- -----------------------------------------------------
-- One row per collected tweet.
--   Twitter_ID   integer key supplied by the inserting code (no AUTO_INCREMENT)
--   Twitter_Name user name of the tweet's author
--   Twitter_Link permalink of the tweet
--   Twitter_Date date kept as text, not as a DATE value
--   Twitter_Text tweet text
-- NOTE(review): this table declares utf8 while the schema default is utf8mb4;
-- utf8 is presumably MySQL's 3-byte encoding, which cannot hold 4-byte
-- characters such as emoji - confirm before relying on it for tweet text.
CREATE TABLE IF NOT EXISTS `os_db`.`twitter` (
  `Twitter_ID` INT(11) NOT NULL,
  `Twitter_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Twitter_Link` VARCHAR(6000) NULL DEFAULT NULL,
  `Twitter_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Twitter_Text` VARCHAR(6000) NULL DEFAULT NULL,
  PRIMARY KEY (`Twitter_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
| 43 | + | ||
| 44 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`user`
-- -----------------------------------------------------
-- Holds only the user key; the user_naver, user_twitter and user_youtube
-- tables reference it through their User_ID foreign keys.
CREATE TABLE IF NOT EXISTS `os_db`.`user` (
  `User_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
| 54 | + | ||
| 55 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`user_naver`
-- -----------------------------------------------------
-- Link table between `user` and `naver`: the composite primary key allows
-- each (User_ID, Naver_ID) pair once, and both columns are foreign keys.
-- The extra index on Naver_ID supports the UN_Naver foreign key, whose
-- column is not the leading column of the primary key.
CREATE TABLE IF NOT EXISTS `os_db`.`user_naver` (
  `User_ID` INT(11) NOT NULL,
  `Naver_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Naver_ID`),
  INDEX `UN_Naver_idx` (`Naver_ID` ASC) ,
  CONSTRAINT `UN_Naver`
    FOREIGN KEY (`Naver_ID`)
    REFERENCES `os_db`.`naver` (`Naver_ID`),
  CONSTRAINT `UN_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
| 73 | + | ||
| 74 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`user_twitter`
-- -----------------------------------------------------
-- Link table between `user` and `twitter`: the composite primary key allows
-- each (User_ID, Twitter_ID) pair once, and both columns are foreign keys.
-- The extra index on Twitter_ID supports the UT_twitter foreign key, whose
-- column is not the leading column of the primary key.
CREATE TABLE IF NOT EXISTS `os_db`.`user_twitter` (
  `User_ID` INT(11) NOT NULL,
  `Twitter_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Twitter_ID`),
  INDEX `UT_twitter_idx` (`Twitter_ID` ASC) ,
  CONSTRAINT `UT_twitter`
    FOREIGN KEY (`Twitter_ID`)
    REFERENCES `os_db`.`twitter` (`Twitter_ID`),
  CONSTRAINT `UT_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
| 92 | + | ||
| 93 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`youtube`
-- -----------------------------------------------------
-- One row per downloaded YouTube comment.
--   Youtube_ID   integer key supplied by the inserting script (no AUTO_INCREMENT)
--   Youtube_Text comment text
--   Youtube_Name name of the comment's author
--   Youtube_Date timestamp kept as text, not as a DATE/DATETIME value
--   Youtube_Link link the comments were downloaded for
-- The engine is InnoDB (it was MyISAM): `user_youtube` declares the foreign
-- key UY_youtube on this table, and MyISAM tables cannot take part in
-- foreign key constraints.
CREATE TABLE IF NOT EXISTS `os_db`.`youtube` (
  `Youtube_ID` INT(11) NOT NULL,
  `Youtube_Text` VARCHAR(12000) NULL DEFAULT NULL,
  `Youtube_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Youtube_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Youtube_Link` VARCHAR(1000) NULL DEFAULT NULL,
  PRIMARY KEY (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
| 106 | + | ||
| 107 | + | ||
-- -----------------------------------------------------
-- Table `os_db`.`user_youtube`
-- -----------------------------------------------------
-- Link table between `user` and `youtube`: the composite primary key allows
-- each (User_ID, Youtube_ID) pair once, and both columns are foreign keys.
-- The extra index on Youtube_ID supports the UY_youtube foreign key, whose
-- column is not the leading column of the primary key.
CREATE TABLE IF NOT EXISTS `os_db`.`user_youtube` (
  `User_ID` INT(11) NOT NULL,
  `Youtube_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Youtube_ID`),
  INDEX `UY_youtube_idx` (`Youtube_ID` ASC) ,
  CONSTRAINT `UY_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`),
  CONSTRAINT `UY_youtube`
    FOREIGN KEY (`Youtube_ID`)
    REFERENCES `os_db`.`youtube` (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
| 125 | + | ||
| 126 | + | ||
-- Restore the session settings saved in the @OLD_* variables at the top of
-- this script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
End/youtube_crawl2.py
0 → 100644
| 1 | +import downloader | ||
| 2 | +import pymysql | ||
| 3 | +import csv | ||
| 4 | +import random | ||
| 5 | + | ||
import os

# SECURITY(review): the connection settings below, including the password,
# are committed in plain text. They are kept only as fallbacks so existing
# runs keep working. Set the OS_DB_* environment variables, rotate the
# password, and then delete the literals from the source.
conn = pymysql.connect(
    host=os.environ.get('OS_DB_HOST', 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com'),
    user=os.environ.get('OS_DB_USER', 'admin'),
    password=os.environ.get('OS_DB_PASSWORD', '41545737!'),
    db=os.environ.get('OS_DB_NAME', 'os_db'),
    charset='utf8',
)
curs = conn.cursor()
| 8 | + | ||
def call_main():
    """Print the progress banner, run ``downloader.main()`` and return its result."""
    border = ' **************************************************************'
    print(' Comment Thread 생성중 \n')
    for _ in range(3):
        print(border)
    print(' **************** 생성 완료 정보를 입력하세요. **************** ')
    for _ in range(3):
        print(border)
    return downloader.main()
| 20 | + | ||
CommentList = call_main()  # list of dicts with the keys cid, text, time, author, link

# Placeholders let the driver escape every value. This closes the SQL
# injection hole of the string-formatted statement and makes stripping
# quotes from the comment text unnecessary, so the text is stored unchanged.
sql = ("insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) "
       "values(%s,%s,%s,%s,%s)")
try:
    # Continue after the highest stored ID so a second run does not hit a
    # duplicate primary key; an empty table still starts at 0 as before.
    curs.execute("select coalesce(max(Youtube_ID), -1) + 1 from youtube")
    first_id = int(curs.fetchone()[0])
    for i, row in enumerate(CommentList, start=first_id):
        params = (i, row['text'], row['time'], row['author'], row['link'])
        print(sql, params)
        curs.execute(sql, params)
    conn.commit()
finally:
    # Close the connection even when an insert fails.
    conn.close()
-
Please register or login to post a comment