Schema and Data set

김건
Commit f8a2482b77a664f41ac8944c8378fbb7ea2a0c4f f8a2482b 1 parent 17c221a1
Showing 5 changed files with 553 additions and 0 deletions
End/Naver_crawl.py
End/Twitter_Input.ipynb
End/downloader.py
End/schema.sql
End/youtube_crawl2.py
--- a/End/Naver_crawl.py 0 → 100644
View file @f8a2482
+++ b/End/Naver_crawl.py 0 → 100644
View file @f8a2482
+from selenium import webdriver
+from selenium.common import exceptions
+from bs4 import BeautifulSoup
+import time
+import pymysql
+
import os

# Database connection shared by the insert loop at the bottom of this script.
# Each setting can be overridden through an environment variable (OS_DB_HOST,
# OS_DB_USER, OS_DB_PASSWORD, OS_DB_NAME); the literals are kept only as
# defaults so that existing runs keep working unchanged.
# NOTE(review): the default password is committed to version control. Rotate
# it and drop the literal defaults once the environment variables are set
# wherever this script runs.
conn = pymysql.connect(
    host=os.environ.get('OS_DB_HOST', 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com'),
    user=os.environ.get('OS_DB_USER', 'admin'),
    password=os.environ.get('OS_DB_PASSWORD', '41545737!'),
    db=os.environ.get('OS_DB_NAME', 'os_db'),
    charset='utf8',
)
curs = conn.cursor()
def getData(url, webdriver_path="C:\\Users\\KimGun\\Desktop\\chromedriver_win32\\chromedriver.exe"):
    """Collect the comments displayed under a Naver news article.

    Opens *url* in Chrome, keeps clicking the ".u_cbox_btn_more" button until
    it can no longer be clicked, then parses the rendered page.

    Args:
        url: Address of the page to crawl.
        webdriver_path: Location of the chromedriver executable. Defaults to
            the path this script has always used.

    Returns:
        A list of dicts with the keys 'userID', 'comment' and 'time', one per
        comment found on the page. The list is also printed.
    """
    options = webdriver.ChromeOptions()
    # To run without a visible window, add the 'headless' and 'disable-gpu'
    # arguments to `options` here.
    driver = webdriver.Chrome(webdriver_path, chrome_options=options)
    try:
        driver.get(url)
        # Let element lookups wait up to 3 seconds for the page to load.
        driver.implicitly_wait(3)
        try:
            while True:
                # Keep pressing the "more comments" button until it is gone.
                time.sleep(0.5)
                driver.find_element_by_css_selector(".u_cbox_btn_more").click()
        except (exceptions.ElementNotVisibleException,
                exceptions.NoSuchElementException):
            # The button is hidden or absent: every comment page is loaded.
            pass
        except Exception as e:
            # Anything else is unexpected; report it and parse what we have.
            print(e)

        pageSource = driver.page_source
    finally:
        # Always close the browser, otherwise every call leaks a Chrome
        # process and its chromedriver.
        driver.quit()

    result = BeautifulSoup(pageSource, "lxml")
    # Pull the raw nickname / text / date elements out of the page.
    comments_raw = result.find_all("span", {"class": "u_cbox_contents"})
    nicknames_raw = result.find_all("span", {"class": "u_cbox_nick"})
    times_raw = result.find_all("span", {"class": "u_cbox_date"})

    # zip stops at the shortest list, so a page where the three element
    # counts differ no longer raises IndexError.
    naverNewsList = [
        {'userID': nickname.text, 'comment': comment.text, 'time': stamp.text}
        for nickname, comment, stamp in zip(nicknames_raw, comments_raw, times_raw)
    ]

    print(naverNewsList)
    return naverNewsList
+
_url = input('검색하고자 하는 url을 입력해주세요: ')
print('comment_list를 가져오는 중.....')

# Placeholders let the driver quote every value, so scraped text containing
# quotes or SQL fragments can neither break nor alter the statement, and the
# comment text no longer needs its apostrophes stripped.
sql = ("insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) "
       "values(%s,%s,%s,%s,%s)")
try:
    cList = getData(_url)
    # Continue numbering after the highest ID already stored instead of a
    # fixed start value, so running the script again does not collide with
    # existing primary keys.
    curs.execute("select coalesce(max(Naver_ID), -1) + 1 from naver")
    i = curs.fetchone()[0]
    for row in cList:  # each row holds userID, comment, time
        params = (i, row['userID'], row['comment'], row['time'], _url)
        # mogrify returns the statement exactly as it will be sent.
        print(curs.mogrify(sql, params))
        curs.execute(sql, params)
        i = i + 1
    conn.commit()
finally:
    # Release the connection even when crawling or an insert fails.
    conn.close()
--- a/End/Twitter_Input.ipynb 0 → 100644
View file @f8a2482
+++ b/End/Twitter_Input.ipynb 0 → 100644
View file @f8a2482
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== 기본으로 설정된 트윗 수집 기간은 2019-12-08 에서 2019-12-10 까지 입니다 ===\n",
+      "=== 총 3일 간의 데이터 수집 ===\n",
+      "검색할 키워드를 입력해주세요: 이동찬\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55c5a56d9ba7478f80d07518e22a3177",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(0,'r9OWHkiDE9EG3W9','I reached alamo for take my money back several times. They only keep showing me wrong information. Maybe ai works for it.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195691775451136')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(1,'r9OWHkiDE9EG3W9','Nope. I gave up 431 dollars already. I just want ppl not to lose their money in a pleasant place.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195233975504896')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(2,'optimum0524','IMF 환란이전 96년쯤 예금금리가 12%였던걸로 기억합니다. 80년대 코오롱그룹 이동찬 회장은 장영자에게 어음수표깡을 받으면서 50%이상의 금리를 적용받았다고하니 정말 요즘기준으로는 이해하기 힘든 시대였지요.','2019-12-10','https://twitter.com/optimum0524/status/1204190641866956801')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(3,'Naerum10','@이동찬','2019-12-09','https://twitter.com/Naerum10/status/1203920823725121537')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(4,'4bur0','이동찬 달려오는 폼이 너무 웃겨','2019-12-08','https://twitter.com/4bur0/status/1203736355584393216')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(5,'r9OWHkiDE9EG3W9','And i tried to contact to the headquarters. They only says “contact the branch”. So irresponsible and irritating reaction to customers.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517328811417600')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(6,'r9OWHkiDE9EG3W9','They told me deposit had been refunded at that time, but now i know they gave me a bullshit.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517327506993152')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(7,'r9OWHkiDE9EG3W9','If u use cash, there wouldn’t remain any record or deposit back at all.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517326278053888')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(8,'r9OWHkiDE9EG3W9','Ppl!! Warning!! At #1778 ala moana blvd, DO NOT use cash!!!','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517325023928320')\n",
+      "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(9,'r9OWHkiDE9EG3W9','@Alamo hello. Alamo and ppl who rent a car at beautiful hawaii~','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517323283324928')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import GetOldTweets3 as got\n",
+    "from bs4 import BeautifulSoup\n",
+    "import pymysql\n",
+    "import datetime\n",
+    "import time\n",
+    "from random import uniform\n",
+    "from tqdm import tqdm_notebook\n",
+    "conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')\n",
+    "curs = conn.cursor()\n",
+    "\n",
+    "def get_tweets(criteria):\n",
+    "    tweet = got.manager.TweetManager.getTweets(criteria)\n",
+    "    tweet_list = []\n",
+    "\n",
+    "    for index in tqdm_notebook(tweet):\n",
+    "\n",
+    "        # 메타데이터 목록\n",
+    "        username = index.username\n",
+    "        link = index.permalink\n",
+    "        content = index.text\n",
+    "        tweet_date = index.date.strftime(\"%Y-%m-%d\")\n",
+    "        retweets = index.retweets\n",
+    "        favorites = index.favorites\n",
+    "\n",
+    "        # 결과 합치기\n",
+    "        info_list = {'username' : username, 'text': content, 'time': tweet_date, 'link': link}\n",
+    "        tweet_list.append(info_list)\n",
+    "        # 휴식\n",
+    "        time.sleep(uniform(1,2))\n",
+    "    return tweet_list\n",
+    "days_range = []\n",
+    "\n",
+    "start = datetime.datetime.strptime(\"2019-12-08\", \"%Y-%m-%d\")\n",
+    "end = datetime.datetime.strptime(\"2019-12-11\", \"%Y-%m-%d\")\n",
+    "date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]\n",
+    "\n",
+    "for date in date_generated:\n",
+    "    days_range.append(date.strftime(\"%Y-%m-%d\"))\n",
+    "print(\"=== 기본으로 설정된 트윗 수집 기간은 {} 에서 {} 까지 입니다 ===\".format(days_range[0], days_range[-1]))\n",
+    "print(\"=== 총 {}일 간의 데이터 수집 ===\".format(len(days_range)))\n",
+    "\n",
+    "# 수집 기간 맞추기\n",
+    "start_date = days_range[0]\n",
+    "end_date = (datetime.datetime.strptime(days_range[-1], \"%Y-%m-%d\")\n",
+    "            + datetime.timedelta(days=1)).strftime(\"%Y-%m-%d\") # setUntil이 끝을 포함하지 않으므로, day + 1\n",
+    "\n",
+    "my_key = input(\"검색할 키워드를 입력해주세요: \")\n",
+    "\n",
+    "tweetCriteria = got.manager.TweetCriteria().setQuerySearch(my_key)\\\n",
+    "                                           .setSince(\"2019-12-08\")\\\n",
+    "                                           .setUntil(\"2019-12-11\")\\\n",
+    "                                           .setMaxTweets(10)\n",
+    "result_list = get_tweets(tweetCriteria)\n",
+    "\n",
+    "i = 0\n",
+    "for row in result_list : # 이름 내용 날짜 링크\n",
+    "    sql = \"insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values({},'{}','{}','{}','{}')\".format(i,row['username'],row['text'],row['time'],row['link'])\n",
+    "    print(sql)\n",
+    "    i = i + 1\n",
+    "    curs.execute(sql)\n",
+    "conn.commit()\n",
+    "conn.close()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/End/downloader.py 0 → 100644
View file @f8a2482
+++ b/End/downloader.py 0 → 100644
View file @f8a2482
+#!/usr/bin/env python
+from __future__ import print_function
+import sys
+import os
+import time
+import json
+import requests
+import argparse
+import lxml.html
+import io
+from urllib.parse import urlparse, parse_qs
+from lxml.cssselect import CSSSelector
+YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
+YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    """Return the double-quote-terminated value that follows *key* in *html*.

    Args:
        html: Text to search.
        key: Marker that precedes the value.
        num_chars: Number of characters to skip between the end of *key* and
            the start of the value (for example ``="`` is 2).

    Returns:
        The text from the start of the value up to the next ``"``. If no
        closing quote exists the rest of *html* is returned. An empty string
        is returned when *key* does not occur at all.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # Without this guard the -1 from find() would be used as an offset
        # and an unrelated slice of the page would be returned.
        return ''
    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end == -1:
        # No terminator: take everything that is left rather than silently
        # dropping the final character.
        return html[pos_begin:]
    return html[pos_begin:pos_end]
def extract_comments(html):
    """Yield one dict per ``.comment-item`` element found in *html*.

    Each dict has the keys 'cid' (the element's ``data-cid`` attribute),
    'text', 'time' (stripped of surrounding whitespace) and 'author', taken
    from the first matching child element of the item.
    """
    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    root = lxml.html.fromstring(html)
    for node in select_items(root):
        text_node = select_text(node)[0]
        time_node = select_time(node)[0]
        author_node = select_author(node)[0]
        yield {
            'cid': node.get('data-cid'),
            'text': text_node.text_content(),
            'time': time_node.text_content().strip(),
            'author': author_node.text_content(),
        }
def extract_reply_cids(html):
    """Return the ``data-cid`` of every "load replies" link in *html*.

    The links are the ``.load-comments`` elements that are direct children
    of a ``.comment-replies-header`` element.
    """
    select_links = CSSSelector('.comment-replies-header > .load-comments')
    root = lxml.html.fromstring(html)
    cids = []
    for link in select_links(root):
        cids.append(link.get('data-cid'))
    return cids
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to *url* until it answers with HTTP 200 or the retries run out.

    Args:
        session: Object whose ``post(url, params=..., data=...)`` returns a
            response exposing ``status_code`` and ``text``.
        url: Address to post to.
        params: Query-string parameters for the request.
        data: Form fields for the request.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two attempts.

    Returns:
        A ``(page_token, html_content)`` tuple decoded from the JSON body of
        the first 200 response; ``page_token`` is None when the body has no
        such field. Returns None when every attempt failed.

    Raises:
        KeyError: If a 200 response carries no 'html_content' field.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        # Back off before trying again, but do not stall after the final
        # attempt when there is nothing left to wait for.
        if attempt < retries - 1:
            time.sleep(sleep)
    return None
def download_comments(youtube_id, sleep=1):
    """Yield the comments of a YouTube video, then the replies to them.

    Fetches the all-comments page, follows its paging token through the
    AJAX endpoint, and finally loads the reply thread of every comment that
    advertised one. After the first page, a comment whose 'cid' has already
    been yielded is skipped.

    Args:
        youtube_id: ID of the video.
        sleep: Seconds to pause after each AJAX request.

    Yields:
        Comment dicts as produced by ``extract_comments``.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # A set keeps the "already yielded?" check constant-time; with a list the
    # check grew slower with every comment collected.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request asks for the ordering menu; later requests send
        # the paging token returned by the previous one.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
## Parse the video ID out of the link the user typed in
def video_id(value):
    """Extract the video ID from a YouTube link.

    Recognised forms:
        - ``youtu.be/<id>``
        - ``(www.|m.)youtube.com/watch?v=<id>``
        - ``(www.|m.)youtube.com/embed/<id>``
        - ``(www.|m.)youtube.com/v/<id>``

    Args:
        value: The link as text.

    Returns:
        The video ID, or None when *value* is not one of the recognised forms
        or carries no ID.
    """
    query = urlparse(value)
    path = query.path
    if query.hostname == 'youtu.be':
        return path[1:] or None
    if query.hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        if path == '/watch':
            # .get() instead of ['v']: a /watch link without a "v" parameter
            # is invalid input, not a reason to raise KeyError.
            values = parse_qs(query.query).get('v')
            return values[0] if values else None
        if path.startswith(('/embed/', '/v/')):
            return path.split('/')[2] or None
    # Not a link this function understands.
    return None
def main(limit=100):
    """Prompt for a YouTube link and collect comments of that video.

    The former "write to a file" branch was removed: it was guarded by a
    condition that could never be true and referenced a counter that was
    never initialised.

    Args:
        limit: Stop after this many comments have been collected.

    Returns:
        A list of dicts with the keys 'cid', 'text', 'time', 'author' and
        'link', where 'link' is the text the user entered.

    On any error (including input that is not a recognised YouTube link) the
    message is printed and the process exits with status 1.
    """
    # Keep the raw input: it is stored with every comment as its 'link'.
    source_link = input('Youtube_ID 입력 :')
    try:
        # Reduce the link to the bare video ID.
        youtube_id = video_id(source_link)
        if not youtube_id:
            raise ValueError('올바른 입력 값을 입력하세요')

        print('Downloading Youtube comments for video:', youtube_id)
        result_List = []
        for count, comment in enumerate(download_comments(youtube_id), start=1):
            result_List.append({
                'cid': comment['cid'],
                'text': str(comment['text']),
                'time': comment['time'],
                'author': comment['author'],
                'link': source_link,
            })
            if limit == count:
                print(' Comment Thread 생성 완료')
                print ('\n\n\n\n\n\n\n')
                break
        return result_List
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)


if __name__ == "__main__":
    main()
--- a/End/schema.sql 0 → 100644
View file @f8a2482
+++ b/End/schema.sql 0 → 100644
View file @f8a2482
-- MySQL Workbench Forward Engineering
--
-- Creates the os_db schema: one table per crawled source (naver, twitter,
-- youtube), a user table, and one link table per source that joins users
-- to the rows of that source.

-- Relax uniqueness / foreign-key checking while the tables are created;
-- the previous settings are restored at the end of the script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';

-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `os_db` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci ;
USE `os_db` ;

-- NOTE(review): the three content tables below use the `utf8` character set,
-- which in MySQL stores at most three bytes per character, so text holding
-- four-byte characters (e.g. emoji) cannot be stored as-is. Switching them to
-- utf8mb4 would also require shrinking the VARCHAR widths or using TEXT to
-- stay within the row-size limit.

-- -----------------------------------------------------
-- Table `os_db`.`naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`naver` (
  `Naver_ID` INT(11) NOT NULL,
  `Naver_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Naver_Text` VARCHAR(6000) NULL DEFAULT NULL,
  `Naver_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Naver_Link` VARCHAR(1000) NULL DEFAULT NULL,
  PRIMARY KEY (`Naver_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;


-- -----------------------------------------------------
-- Table `os_db`.`twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`twitter` (
  `Twitter_ID` INT(11) NOT NULL,
  `Twitter_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Twitter_Link` VARCHAR(6000) NULL DEFAULT NULL,
  `Twitter_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Twitter_Text` VARCHAR(6000) NULL DEFAULT NULL,
  PRIMARY KEY (`Twitter_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;


-- -----------------------------------------------------
-- Table `os_db`.`user`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user` (
  `User_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;


-- -----------------------------------------------------
-- Table `os_db`.`user_naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_naver` (
  `User_ID` INT(11) NOT NULL,
  `Naver_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Naver_ID`),
  INDEX `UN_Naver_idx` (`Naver_ID` ASC) ,
  CONSTRAINT `UN_Naver`
    FOREIGN KEY (`Naver_ID`)
    REFERENCES `os_db`.`naver` (`Naver_ID`),
  CONSTRAINT `UN_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;


-- -----------------------------------------------------
-- Table `os_db`.`user_twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_twitter` (
  `User_ID` INT(11) NOT NULL,
  `Twitter_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Twitter_ID`),
  INDEX `UT_twitter_idx` (`Twitter_ID` ASC) ,
  CONSTRAINT `UT_twitter`
    FOREIGN KEY (`Twitter_ID`)
    REFERENCES `os_db`.`twitter` (`Twitter_ID`),
  CONSTRAINT `UT_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;


-- -----------------------------------------------------
-- Table `os_db`.`youtube`
-- -----------------------------------------------------
-- InnoDB rather than MyISAM: `user_youtube` declares a foreign key to this
-- table, and a foreign key needs parent and child on the same engine.
CREATE TABLE IF NOT EXISTS `os_db`.`youtube` (
  `Youtube_ID` INT(11) NOT NULL,
  `Youtube_Text` VARCHAR(12000) NULL DEFAULT NULL,
  `Youtube_Name` VARCHAR(6000) NULL DEFAULT NULL,
  `Youtube_Date` VARCHAR(100) NULL DEFAULT NULL,
  `Youtube_Link` VARCHAR(1000) NULL DEFAULT NULL,
  PRIMARY KEY (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;


-- -----------------------------------------------------
-- Table `os_db`.`user_youtube`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_youtube` (
  `User_ID` INT(11) NOT NULL,
  `Youtube_ID` INT(11) NOT NULL,
  PRIMARY KEY (`User_ID`, `Youtube_ID`),
  INDEX `UY_youtube_idx` (`Youtube_ID` ASC) ,
  CONSTRAINT `UY_user`
    FOREIGN KEY (`User_ID`)
    REFERENCES `os_db`.`user` (`User_ID`),
  CONSTRAINT `UY_youtube`
    FOREIGN KEY (`Youtube_ID`)
    REFERENCES `os_db`.`youtube` (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;


-- Restore the session settings saved at the top of the script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
--- a/End/youtube_crawl2.py 0 → 100644
View file @f8a2482
+++ b/End/youtube_crawl2.py 0 → 100644
View file @f8a2482
+import downloader
+import pymysql
+import csv
+import random
+
import os

# Database connection shared by the insert loop at the bottom of this script.
# Each setting can be overridden through an environment variable (OS_DB_HOST,
# OS_DB_USER, OS_DB_PASSWORD, OS_DB_NAME); the literals are kept only as
# defaults so that existing runs keep working unchanged.
# NOTE(review): the default password is committed to version control. Rotate
# it and drop the literal defaults once the environment variables are set
# wherever this script runs.
conn = pymysql.connect(
    host=os.environ.get('OS_DB_HOST', 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com'),
    user=os.environ.get('OS_DB_USER', 'admin'),
    password=os.environ.get('OS_DB_PASSWORD', '41545737!'),
    db=os.environ.get('OS_DB_NAME', 'os_db'),
    charset='utf8',
)
curs = conn.cursor()
+
def call_main ():
    """Print the banner, run ``downloader.main()`` and return its result."""
    star_row = ' **************************************************************'
    banner = (
        ' Comment Thread 생성중 \n',
        star_row,
        star_row,
        star_row,
        ' **************** 생성 완료 정보를 입력하세요. ****************  ',
        star_row,
        star_row,
        star_row,
    )
    for banner_line in banner:
        print(banner_line)
    return downloader.main()
+
# Placeholders let the driver quote every value, so comment text containing
# quotes or SQL fragments can neither break nor alter the statement, and the
# text no longer needs its apostrophes stripped.
sql = ("insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) "
       "values(%s,%s,%s,%s,%s)")
try:
    CommentList = call_main() ## list of dicts: cid, text, time, author, link
    # Continue numbering after the highest ID already stored; always starting
    # at 0 made every run after the first collide with existing primary keys.
    curs.execute("select coalesce(max(Youtube_ID), -1) + 1 from youtube")
    i = curs.fetchone()[0]
    for row in CommentList:
        params = (i, row['text'], row['time'], row['author'], row['link'])
        # mogrify returns the statement exactly as it will be sent.
        print(curs.mogrify(sql, params))
        curs.execute(sql, params)
        i = i + 1
    conn.commit()
finally:
    # Release the connection even when the download or an insert fails.
    conn.close()