김건

Schema and Data set

from selenium import webdriver
from selenium.common import exceptions
from bs4 import BeautifulSoup
import time
import pymysql
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()
def getData(url):
    ## chrome options
    options = webdriver.ChromeOptions()
    #options.add_argument('headless')
    #options.add_argument("disable-gpu")
    _url = url  # URL to crawl
    webDriver = "C:\\Users\\KimGun\\Desktop\\chromedriver_win32\\chromedriver.exe"  # local path to my chromedriver
    driver = webdriver.Chrome(webDriver, chrome_options=options)
    #driver = webdriver.Chrome(webDriver)
    driver.get(_url)
    pageCnt = 0
    driver.implicitly_wait(3)  # wait until the page has fully loaded
    try:
        while True:  # loop until the last comment page
            # keep clicking the 'more' (더보기) button of the Naver News comment box
            # via find_element_by_css_selector until it is no longer shown
            time.sleep(0.5)
            driver.find_element_by_css_selector(".u_cbox_btn_more").click()
            pageCnt = pageCnt + 1
    except exceptions.ElementNotVisibleException as e:  # no more comment pages
        pass
    except Exception as e:  # report any other exception
        print(e)
    pageSource = driver.page_source  # grab the page source
    result = BeautifulSoup(pageSource, "lxml")  # parse with lxml for speed
    # pull the raw nickname, text and time nodes
    comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
    nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
    times_raw = result.find_all("span", {"class" : "u_cbox_date"})
    # keep only the nickname, text and time values, as plain lists
    comments = [comment.text for comment in comments_raw]
    nicknames = [nickname.text for nickname in nicknames_raw]
    times = [t.text for t in times_raw]  # 't', not 'time', so the time module is not shadowed
    naverNewsList = []
    for i in range(len(comments)):
        info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
        naverNewsList.append(info_dic)
    print(naverNewsList)
    return naverNewsList
    #driver.quit()
_url = input('검색하고자 하는 url을 입력해주세요: ')
print('comment_list를 가져오는 중.....')
cList = getData(_url)
i = 194  # starting Naver_ID value (manually chosen offset)
for row in cList :  ## userID, comment, time
    temp = row['comment'].replace("'", '')  # strip single quotes so they do not break the SQL string
    sql = "insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) values({},'{}','{}','{}','{}')".format(i,row['userID'],temp,row['time'],_url)
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()
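The loop above splices values into the SQL string with format() and strips single quotes by hand, which breaks on other special characters. A minimal alternative sketch of the same insert using pymysql's %s placeholders (it assumes the conn, curs, cList, and _url defined above and is not the script as submitted):

# Sketch: let pymysql escape the values instead of formatting the SQL string by hand.
insert_sql = ("insert into naver (Naver_ID, Naver_Name, Naver_Text, Naver_Date, Naver_link) "
              "values (%s, %s, %s, %s, %s)")
start_id = 194  # same manually chosen starting Naver_ID as above
for offset, row in enumerate(cList):
    # %s placeholders handle quotes and other special characters without manual stripping
    curs.execute(insert_sql, (start_id + offset, row['userID'], row['comment'], row['time'], _url))
conn.commit()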
import GetOldTweets3 as got
from bs4 import BeautifulSoup
import pymysql
import datetime
import time
from random import uniform
from tqdm import tqdm_notebook
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()

def get_tweets(criteria):
    tweet = got.manager.TweetManager.getTweets(criteria)
    tweet_list = []

    for index in tqdm_notebook(tweet):

        # metadata fields
        username = index.username
        link = index.permalink
        content = index.text
        tweet_date = index.date.strftime("%Y-%m-%d")
        retweets = index.retweets
        favorites = index.favorites

        # merge into one record
        info_list = {'username' : username, 'text': content, 'time': tweet_date, 'link': link}
        tweet_list.append(info_list)
        # pause between tweets
        time.sleep(uniform(1,2))
    return tweet_list

days_range = []

start = datetime.datetime.strptime("2019-12-08", "%Y-%m-%d")
end = datetime.datetime.strptime("2019-12-11", "%Y-%m-%d")
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]

for date in date_generated:
    days_range.append(date.strftime("%Y-%m-%d"))
print("=== 기본으로 설정된 트윗 수집 기간은 {} 에서 {} 까지 입니다 ===".format(days_range[0], days_range[-1]))
print("=== 총 {}일 간의 데이터 수집 ===".format(len(days_range)))

# align the collection window
start_date = days_range[0]
end_date = (datetime.datetime.strptime(days_range[-1], "%Y-%m-%d")
            + datetime.timedelta(days=1)).strftime("%Y-%m-%d")  # setUntil does not include the end date, so add one day

my_key = input("검색할 키워드를 입력해주세요: ")

tweetCriteria = got.manager.TweetCriteria().setQuerySearch(my_key)\
                                           .setSince(start_date)\
                                           .setUntil(end_date)\
                                           .setMaxTweets(10)
result_list = get_tweets(tweetCriteria)

i = 0
for row in result_list :  # name, text, date, link
    sql = "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values({},'{}','{}','{}','{}')".format(i,row['username'],row['text'],row['time'],row['link'])
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()

Sample output (keyword: 이동찬):
=== 기본으로 설정된 트윗 수집 기간은 2019-12-08 에서 2019-12-10 까지 입니다 ===
=== 총 3일 간의 데이터 수집 ===
검색할 키워드를 입력해주세요: 이동찬
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(0,'r9OWHkiDE9EG3W9','I reached alamo for take my money back several times. They only keep showing me wrong information. Maybe ai works for it.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195691775451136')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(1,'r9OWHkiDE9EG3W9','Nope. I gave up 431 dollars already. I just want ppl not to lose their money in a pleasant place.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195233975504896')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(2,'optimum0524','IMF 환란이전 96년쯤 예금금리가 12%였던걸로 기억합니다. 80년대 코오롱그룹 이동찬 회장은 장영자에게 어음수표깡을 받으면서 50%이상의 금리를 적용받았다고하니 정말 요즘기준으로는 이해하기 힘든 시대였지요.','2019-12-10','https://twitter.com/optimum0524/status/1204190641866956801')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(3,'Naerum10','@이동찬','2019-12-09','https://twitter.com/Naerum10/status/1203920823725121537')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(4,'4bur0','이동찬 달려오는 폼이 너무 웃겨','2019-12-08','https://twitter.com/4bur0/status/1203736355584393216')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(5,'r9OWHkiDE9EG3W9','And i tried to contact to the headquarters. They only says “contact the branch”. So irresponsible and irritating reaction to customers.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517328811417600')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(6,'r9OWHkiDE9EG3W9','They told me deposit had been refunded at that time, but now i know they gave me a bullshit.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517327506993152')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(7,'r9OWHkiDE9EG3W9','If u use cash, there wouldn’t remain any record or deposit back at all.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517326278053888')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(8,'r9OWHkiDE9EG3W9','Ppl!! Warning!! At #1778 ala moana blvd, DO NOT use cash!!!','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517325023928320')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(9,'r9OWHkiDE9EG3W9','@Alamo hello. Alamo and ppl who rent a car at beautiful hawaii~','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517323283324928')
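The notebook adds one day to the last collection date because GetOldTweets3's setUntil bound is exclusive while setSince is inclusive. A small standalone sketch of just that date arithmetic (standard library only, no scraping):

import datetime

def collection_window(first_day, last_day):
    """Return (since, until) strings for GetOldTweets3:
    since is inclusive, until is exclusive, so until = last_day + 1 day."""
    fmt = "%Y-%m-%d"
    last = datetime.datetime.strptime(last_day, fmt)
    until = (last + datetime.timedelta(days=1)).strftime(fmt)
    return first_day, until

# The defaults used in the notebook: 2019-12-08 through 2019-12-10 inclusive
print(collection_window("2019-12-08", "2019-12-10"))  # ('2019-12-08', '2019-12-11')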
#!/usr/bin/env python
from __future__ import print_function
import sys
import os
import time
import json
import requests
import argparse
import lxml.html
import io
from urllib.parse import urlparse, parse_qs
from lxml.cssselect import CSSSelector
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]

def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')
    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}

def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)
def download_comments(youtube_id, sleep=1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        time.sleep(sleep)
## parse the video id out of the input URL
def video_id(value):
    query = urlparse(value)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            p = parse_qs(query.query)
            return p['v'][0]
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    # fail?
    return None
def main():
    #parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    #parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    #parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    #parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    #parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')
    Youtube_id1 = input('Youtube_ID 입력 :')
    ## keep the full link, then cut it down to just the video id
    Youtube_id2 = Youtube_id1
    Youtube_id1 = video_id(Youtube_id1)
    youtube_id = Youtube_id1
    try:
        # args = parser.parse_args(argv)
        #youtube_id = args.youtubeid
        #output = args.output
        #limit = args.limit
        result_List = []
        ## read the input values and assign them
        ## if Limit is left blank, fall back to a default of 100
        if not youtube_id :
            #parser.print_usage()
            #raise ValueError('you need to specify a Youtube ID and an output filename')
            raise ValueError('올바른 입력 값을 입력하세요')
        print('Downloading Youtube comments for video:', youtube_id)
        Number = 1
        if Number == '0' :  # file-output mode (never taken: Number is fixed to 1 above)
            Output1 = input('결과를 받을 파일 입력 :')
            Limit1 = input('제한 갯수 입력 : ')
            if Limit1 == '' :
                Limit1 = 100
            Limit1 = int(Limit1)
            limit = int(Limit1)
            output = Output1
            count = 0
            ##### read these via input() instead of command-line arguments
            with io.open(output, 'w', encoding='utf8') as fp:
                for comment in download_comments(youtube_id):
                    comment_json = json.dumps(comment, ensure_ascii=False)
                    print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
                    count += 1
                    sys.stdout.flush()
                    if limit and count >= limit:
                        print('Downloaded {} comment(s)\r'.format(count))
                        print('\nDone!')
                        break
        else :
            count = 0
            i = 0
            limit = 100
            for comment in download_comments(youtube_id):
                dic = {}
                dic['cid'] = comment['cid']
                dic['text'] = str(comment['text'])
                dic['time'] = comment['time']
                dic['author'] = comment['author']
                dic['link'] = Youtube_id2
                result_List.append(dic)
                count += 1
                i += 1
                if limit == count :
                    print(' Comment Thread 생성 완료')
                    print('\n\n\n\n\n\n\n')
                    break
        return result_List
        #goto_Menu(result_List)
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)

if __name__ == "__main__":
    main()
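For reference, video_id() above covers the usual YouTube URL shapes. A short usage sketch, assuming the function is importable from this script; the video ID dQw4w9WgXcQ is only a placeholder, not one used in the project:

# Quick check of the URL forms video_id() handles (placeholder ID).
for url in ("https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/embed/dQw4w9WgXcQ",
            "https://www.youtube.com/v/dQw4w9WgXcQ"):
    print(url, "->", video_id(url))  # all four print dQw4w9WgXcQ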
-- MySQL Workbench Forward Engineering
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';
-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `os_db` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci ;
USE `os_db` ;
-- -----------------------------------------------------
-- Table `os_db`.`naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`naver` (
`Naver_ID` INT(11) NOT NULL,
`Naver_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Naver_Text` VARCHAR(6000) NULL DEFAULT NULL,
`Naver_Date` VARCHAR(100) NULL DEFAULT NULL,
`Naver_Link` VARCHAR(1000) NULL DEFAULT NULL,
PRIMARY KEY (`Naver_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`twitter` (
`Twitter_ID` INT(11) NOT NULL,
`Twitter_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Twitter_Link` VARCHAR(6000) NULL DEFAULT NULL,
`Twitter_Date` VARCHAR(100) NULL DEFAULT NULL,
`Twitter_Text` VARCHAR(6000) NULL DEFAULT NULL,
PRIMARY KEY (`Twitter_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`user`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user` (
`User_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`user_naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_naver` (
`User_ID` INT(11) NOT NULL,
`Naver_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Naver_ID`),
INDEX `UN_Naver_idx` (`Naver_ID` ASC) ,
CONSTRAINT `UN_Naver`
FOREIGN KEY (`Naver_ID`)
REFERENCES `os_db`.`naver` (`Naver_ID`),
CONSTRAINT `UN_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`user_twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_twitter` (
`User_ID` INT(11) NOT NULL,
`Twitter_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Twitter_ID`),
INDEX `UT_twitter_idx` (`Twitter_ID` ASC) ,
CONSTRAINT `UT_twitter`
FOREIGN KEY (`Twitter_ID`)
REFERENCES `os_db`.`twitter` (`Twitter_ID`),
CONSTRAINT `UT_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`youtube`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`youtube` (
`Youtube_ID` INT(11) NOT NULL,
`Youtube_Text` VARCHAR(12000) NULL DEFAULT NULL,
`Youtube_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Youtube_Date` VARCHAR(100) NULL DEFAULT NULL,
`Youtube_Link` VARCHAR(1000) NULL DEFAULT NULL,
PRIMARY KEY (`Youtube_ID`))
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`user_youtube`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_youtube` (
`User_ID` INT(11) NOT NULL,
`Youtube_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Youtube_ID`),
INDEX `UY_youtube_idx` (`Youtube_ID` ASC) ,
CONSTRAINT `UY_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`),
CONSTRAINT `UY_youtube`
FOREIGN KEY (`Youtube_ID`)
REFERENCES `os_db`.`youtube` (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
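The user_naver, user_twitter, and user_youtube tables are junction tables that tie a User_ID to rows in the three source tables. A minimal sketch of reading back one user's Naver comments through user_naver with pymysql (the User_ID value 1 is only an example; the connection settings are the same ones used elsewhere in this project):

import pymysql

conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com',
                       user='admin', password='41545737!', db='os_db', charset='utf8')
curs = conn.cursor()

# All Naver comments linked to one user through the user_naver junction table.
curs.execute(
    "SELECT n.Naver_Name, n.Naver_Text, n.Naver_Date "
    "FROM user_naver un JOIN naver n ON n.Naver_ID = un.Naver_ID "
    "WHERE un.User_ID = %s", (1,))
for name, text, date in curs.fetchall():
    print(name, date, text)
conn.close()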
import downloader
import pymysql
import csv
import random
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()
def call_main ():
    print(' Comment Thread 생성중 \n')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************** 생성 완료 정보를 입력하세요. **************** ')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************************************************************')
    a = downloader.main()
    return a

CommentList = call_main()  ## list of dicts: cid, text, time, author, link
i = 0
for row in CommentList :
    temp = row['text'].replace("'", '')  # strip single quotes so they do not break the SQL string
    sql = "insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) values({},'{}','{}','{}','{}')".format(i,temp,row['time'],row['author'],row['link'])
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()
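As a quick sanity check after loading, a short sketch that counts the rows landed in each source table (same connection settings as above; the table names come from the schema):

import pymysql

conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com',
                       user='admin', password='41545737!', db='os_db', charset='utf8')
curs = conn.cursor()
for table in ('naver', 'twitter', 'youtube'):
    # table names cannot be parameterized, so they are formatted in from a fixed list
    curs.execute("SELECT COUNT(*) FROM {}".format(table))
    print(table, curs.fetchone()[0])
conn.close()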