김건

Schema and Data set

from selenium import webdriver
from selenium.common import exceptions
from bs4 import BeautifulSoup
import time
import pymysql
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()
def getData(url):
    ## chrome options
    options = webdriver.ChromeOptions()
    #options.add_argument('headless')
    #options.add_argument("disable-gpu")
    _url = url  # URL to crawl
    webDriver = "C:\\Users\\KimGun\\Desktop\\chromedriver_win32\\chromedriver.exe"  # local path to my chromedriver
    driver = webdriver.Chrome(webDriver, chrome_options=options)
    #driver = webdriver.Chrome(webDriver)
    driver.get(_url)
    pageCnt = 0
    driver.implicitly_wait(3)  # wait until the page has fully loaded
    try:
        while True:  # loop until the last comment page
            # keep clicking the 'more' (더보기) button of the Naver News comment box
            # via find_element_by_css_selector until it is no longer shown
            time.sleep(0.5)
            driver.find_element_by_css_selector(".u_cbox_btn_more").click()
            pageCnt = pageCnt + 1
    except exceptions.ElementNotVisibleException as e:  # no more comment pages
        pass
    except Exception as e:  # report any other exception
        print(e)
    pageSource = driver.page_source  # grab the page source
    result = BeautifulSoup(pageSource, "lxml")  # parse with lxml for speed
    # pull the raw nickname, text and time nodes
    comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
    nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
    times_raw = result.find_all("span", {"class" : "u_cbox_date"})
    # keep only the nickname, text and time values, as plain lists
    comments = [comment.text for comment in comments_raw]
    nicknames = [nickname.text for nickname in nicknames_raw]
    times = [t.text for t in times_raw]  # 't', not 'time', so the time module is not shadowed
    naverNewsList = []
    for i in range(len(comments)):
        info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
        naverNewsList.append(info_dic)
    print(naverNewsList)
    return naverNewsList
    #driver.quit()
_url = input('검색하고자 하는 url을 입력해주세요: ')
print('comment_list를 가져오는 중.....')
cList = getData(_url)
i = 194  # starting Naver_ID value (manually chosen offset)
for row in cList :  ## userID, comment, time
    temp = row['comment'].replace("'", '')  # strip single quotes so they do not break the SQL string
    sql = "insert into naver (Naver_ID,Naver_Name,Naver_Text,Naver_Date,Naver_link) values({},'{}','{}','{}','{}')".format(i,row['userID'],temp,row['time'],_url)
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()
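The loop above splices values into the SQL string with format() and strips single quotes by hand, which breaks on other special characters. A minimal alternative sketch of the same insert using pymysql's %s placeholders (it assumes the conn, curs, cList, and _url defined above and is not the script as submitted):

# Sketch: let pymysql escape the values instead of formatting the SQL string by hand.
insert_sql = ("insert into naver (Naver_ID, Naver_Name, Naver_Text, Naver_Date, Naver_link) "
              "values (%s, %s, %s, %s, %s)")
start_id = 194  # same manually chosen starting Naver_ID as above
for offset, row in enumerate(cList):
    # %s placeholders handle quotes and other special characters without manual stripping
    curs.execute(insert_sql, (start_id + offset, row['userID'], row['comment'], row['time'], _url))
conn.commit()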
import GetOldTweets3 as got
from bs4 import BeautifulSoup
import pymysql
import datetime
import time
from random import uniform
from tqdm import tqdm_notebook
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()

def get_tweets(criteria):
    tweet = got.manager.TweetManager.getTweets(criteria)
    tweet_list = []

    for index in tqdm_notebook(tweet):

        # metadata fields
        username = index.username
        link = index.permalink
        content = index.text
        tweet_date = index.date.strftime("%Y-%m-%d")
        retweets = index.retweets
        favorites = index.favorites

        # merge into one record
        info_list = {'username' : username, 'text': content, 'time': tweet_date, 'link': link}
        tweet_list.append(info_list)
        # pause between tweets
        time.sleep(uniform(1,2))
    return tweet_list

days_range = []

start = datetime.datetime.strptime("2019-12-08", "%Y-%m-%d")
end = datetime.datetime.strptime("2019-12-11", "%Y-%m-%d")
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]

for date in date_generated:
    days_range.append(date.strftime("%Y-%m-%d"))
print("=== 기본으로 설정된 트윗 수집 기간은 {} 에서 {} 까지 입니다 ===".format(days_range[0], days_range[-1]))
print("=== 총 {}일 간의 데이터 수집 ===".format(len(days_range)))

# align the collection window
start_date = days_range[0]
end_date = (datetime.datetime.strptime(days_range[-1], "%Y-%m-%d")
            + datetime.timedelta(days=1)).strftime("%Y-%m-%d")  # setUntil does not include the end date, so add one day

my_key = input("검색할 키워드를 입력해주세요: ")

tweetCriteria = got.manager.TweetCriteria().setQuerySearch(my_key)\
                                           .setSince(start_date)\
                                           .setUntil(end_date)\
                                           .setMaxTweets(10)
result_list = get_tweets(tweetCriteria)

i = 0
for row in result_list :  # name, text, date, link
    sql = "insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values({},'{}','{}','{}','{}')".format(i,row['username'],row['text'],row['time'],row['link'])
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()

Sample output (keyword: 이동찬):
=== 기본으로 설정된 트윗 수집 기간은 2019-12-08 에서 2019-12-10 까지 입니다 ===
=== 총 3일 간의 데이터 수집 ===
검색할 키워드를 입력해주세요: 이동찬
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(0,'r9OWHkiDE9EG3W9','I reached alamo for take my money back several times. They only keep showing me wrong information. Maybe ai works for it.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195691775451136')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(1,'r9OWHkiDE9EG3W9','Nope. I gave up 431 dollars already. I just want ppl not to lose their money in a pleasant place.','2019-12-10','https://twitter.com/r9OWHkiDE9EG3W9/status/1204195233975504896')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(2,'optimum0524','IMF 환란이전 96년쯤 예금금리가 12%였던걸로 기억합니다. 80년대 코오롱그룹 이동찬 회장은 장영자에게 어음수표깡을 받으면서 50%이상의 금리를 적용받았다고하니 정말 요즘기준으로는 이해하기 힘든 시대였지요.','2019-12-10','https://twitter.com/optimum0524/status/1204190641866956801')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(3,'Naerum10','@이동찬','2019-12-09','https://twitter.com/Naerum10/status/1203920823725121537')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(4,'4bur0','이동찬 달려오는 폼이 너무 웃겨','2019-12-08','https://twitter.com/4bur0/status/1203736355584393216')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(5,'r9OWHkiDE9EG3W9','And i tried to contact to the headquarters. They only says “contact the branch”. So irresponsible and irritating reaction to customers.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517328811417600')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(6,'r9OWHkiDE9EG3W9','They told me deposit had been refunded at that time, but now i know they gave me a bullshit.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517327506993152')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(7,'r9OWHkiDE9EG3W9','If u use cash, there wouldn’t remain any record or deposit back at all.','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517326278053888')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(8,'r9OWHkiDE9EG3W9','Ppl!! Warning!! At #1778 ala moana blvd, DO NOT use cash!!!','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517325023928320')
insert into twitter (Twitter_ID,Twitter_Name,Twitter_Text,Twitter_Date,Twitter_Link) values(9,'r9OWHkiDE9EG3W9','@Alamo hello. Alamo and ppl who rent a car at beautiful hawaii~','2019-12-08','https://twitter.com/r9OWHkiDE9EG3W9/status/1203517323283324928')
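The notebook adds one day to the last collection date because GetOldTweets3's setUntil bound is exclusive while setSince is inclusive. A small standalone sketch of just that date arithmetic (standard library only, no scraping):

import datetime

def collection_window(first_day, last_day):
    """Return (since, until) strings for GetOldTweets3:
    since is inclusive, until is exclusive, so until = last_day + 1 day."""
    fmt = "%Y-%m-%d"
    last = datetime.datetime.strptime(last_day, fmt)
    until = (last + datetime.timedelta(days=1)).strftime(fmt)
    return first_day, until

# The defaults used in the notebook: 2019-12-08 through 2019-12-10 inclusive
print(collection_window("2019-12-08", "2019-12-10"))  # ('2019-12-08', '2019-12-11')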
#!/usr/bin/env python
from __future__ import print_function
import sys
import os
import time
import json
import requests
import argparse
import lxml.html
import io
from urllib.parse import urlparse, parse_qs
from lxml.cssselect import CSSSelector
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]

def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')
    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}

def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)
def download_comments(youtube_id, sleep=1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        time.sleep(sleep)
## parse the video id out of the input URL
def video_id(value):
    query = urlparse(value)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            p = parse_qs(query.query)
            return p['v'][0]
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    # fail?
    return None
def main():
    #parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    #parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    #parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    #parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    #parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')
    Youtube_id1 = input('Youtube_ID 입력 :')
    ## keep the full link, then cut it down to just the video id
    Youtube_id2 = Youtube_id1
    Youtube_id1 = video_id(Youtube_id1)
    youtube_id = Youtube_id1
    try:
        # args = parser.parse_args(argv)
        #youtube_id = args.youtubeid
        #output = args.output
        #limit = args.limit
        result_List = []
        ## read the input values and assign them
        ## if Limit is left blank, fall back to a default of 100
        if not youtube_id :
            #parser.print_usage()
            #raise ValueError('you need to specify a Youtube ID and an output filename')
            raise ValueError('올바른 입력 값을 입력하세요')
        print('Downloading Youtube comments for video:', youtube_id)
        Number = 1
        if Number == '0' :  # file-output mode (never taken: Number is fixed to 1 above)
            Output1 = input('결과를 받을 파일 입력 :')
            Limit1 = input('제한 갯수 입력 : ')
            if Limit1 == '' :
                Limit1 = 100
            Limit1 = int(Limit1)
            limit = int(Limit1)
            output = Output1
            count = 0
            ##### read these via input() instead of command-line arguments
            with io.open(output, 'w', encoding='utf8') as fp:
                for comment in download_comments(youtube_id):
                    comment_json = json.dumps(comment, ensure_ascii=False)
                    print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
                    count += 1
                    sys.stdout.flush()
                    if limit and count >= limit:
                        print('Downloaded {} comment(s)\r'.format(count))
                        print('\nDone!')
                        break
        else :
            count = 0
            i = 0
            limit = 100
            for comment in download_comments(youtube_id):
                dic = {}
                dic['cid'] = comment['cid']
                dic['text'] = str(comment['text'])
                dic['time'] = comment['time']
                dic['author'] = comment['author']
                dic['link'] = Youtube_id2
                result_List.append(dic)
                count += 1
                i += 1
                if limit == count :
                    print(' Comment Thread 생성 완료')
                    print('\n\n\n\n\n\n\n')
                    break
        return result_List
        #goto_Menu(result_List)
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)

if __name__ == "__main__":
    main()
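For reference, video_id() above covers the usual YouTube URL shapes. A short usage sketch, assuming the function is importable from this script; the video ID dQw4w9WgXcQ is only a placeholder, not one used in the project:

# Quick check of the URL forms video_id() handles (placeholder ID).
for url in ("https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/embed/dQw4w9WgXcQ",
            "https://www.youtube.com/v/dQw4w9WgXcQ"):
    print(url, "->", video_id(url))  # all four print dQw4w9WgXcQ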
-- MySQL Workbench Forward Engineering
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';
-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
-- -----------------------------------------------------
-- Schema os_db
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `os_db` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci ;
USE `os_db` ;
-- -----------------------------------------------------
-- Table `os_db`.`naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`naver` (
`Naver_ID` INT(11) NOT NULL,
`Naver_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Naver_Text` VARCHAR(6000) NULL DEFAULT NULL,
`Naver_Date` VARCHAR(100) NULL DEFAULT NULL,
`Naver_Link` VARCHAR(1000) NULL DEFAULT NULL,
PRIMARY KEY (`Naver_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`twitter` (
`Twitter_ID` INT(11) NOT NULL,
`Twitter_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Twitter_Link` VARCHAR(6000) NULL DEFAULT NULL,
`Twitter_Date` VARCHAR(100) NULL DEFAULT NULL,
`Twitter_Text` VARCHAR(6000) NULL DEFAULT NULL,
PRIMARY KEY (`Twitter_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`user`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user` (
`User_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`user_naver`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_naver` (
`User_ID` INT(11) NOT NULL,
`Naver_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Naver_ID`),
INDEX `UN_Naver_idx` (`Naver_ID` ASC) ,
CONSTRAINT `UN_Naver`
FOREIGN KEY (`Naver_ID`)
REFERENCES `os_db`.`naver` (`Naver_ID`),
CONSTRAINT `UN_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`user_twitter`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_twitter` (
`User_ID` INT(11) NOT NULL,
`Twitter_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Twitter_ID`),
INDEX `UT_twitter_idx` (`Twitter_ID` ASC) ,
CONSTRAINT `UT_twitter`
FOREIGN KEY (`Twitter_ID`)
REFERENCES `os_db`.`twitter` (`Twitter_ID`),
CONSTRAINT `UT_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
-- -----------------------------------------------------
-- Table `os_db`.`youtube`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`youtube` (
`Youtube_ID` INT(11) NOT NULL,
`Youtube_Text` VARCHAR(12000) NULL DEFAULT NULL,
`Youtube_Name` VARCHAR(6000) NULL DEFAULT NULL,
`Youtube_Date` VARCHAR(100) NULL DEFAULT NULL,
`Youtube_Link` VARCHAR(1000) NULL DEFAULT NULL,
PRIMARY KEY (`Youtube_ID`))
ENGINE = MyISAM
DEFAULT CHARACTER SET = utf8;
-- -----------------------------------------------------
-- Table `os_db`.`user_youtube`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `os_db`.`user_youtube` (
`User_ID` INT(11) NOT NULL,
`Youtube_ID` INT(11) NOT NULL,
PRIMARY KEY (`User_ID`, `Youtube_ID`),
INDEX `UY_youtube_idx` (`Youtube_ID` ASC) ,
CONSTRAINT `UY_user`
FOREIGN KEY (`User_ID`)
REFERENCES `os_db`.`user` (`User_ID`),
CONSTRAINT `UY_youtube`
FOREIGN KEY (`Youtube_ID`)
REFERENCES `os_db`.`youtube` (`Youtube_ID`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4
COLLATE = utf8mb4_unicode_ci;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
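The user_naver, user_twitter, and user_youtube tables are junction tables that tie a User_ID to rows in the three source tables. A minimal sketch of reading back one user's Naver comments through user_naver with pymysql (the User_ID value 1 is only an example; the connection settings are the same ones used elsewhere in this project):

import pymysql

conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com',
                       user='admin', password='41545737!', db='os_db', charset='utf8')
curs = conn.cursor()

# All Naver comments linked to one user through the user_naver junction table.
curs.execute(
    "SELECT n.Naver_Name, n.Naver_Text, n.Naver_Date "
    "FROM user_naver un JOIN naver n ON n.Naver_ID = un.Naver_ID "
    "WHERE un.User_ID = %s", (1,))
for name, text, date in curs.fetchall():
    print(name, date, text)
conn.close()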
import downloader
import pymysql
import csv
import random
conn = pymysql.connect(host = 'database-1.cg0acc768it6.us-east-1.rds.amazonaws.com', user = 'admin', password ='41545737!',db= 'os_db',charset = 'utf8')
curs = conn.cursor()
def call_main ():
    print(' Comment Thread 생성중 \n')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************** 생성 완료 정보를 입력하세요. **************** ')
    print(' **************************************************************')
    print(' **************************************************************')
    print(' **************************************************************')
    a = downloader.main()
    return a

CommentList = call_main()  ## list of dicts: cid, text, time, author, link
i = 0
for row in CommentList :
    temp = row['text'].replace("'", '')  # strip single quotes so they do not break the SQL string
    sql = "insert into youtube (Youtube_ID,Youtube_Text,Youtube_Date,Youtube_Name,Youtube_Link) values({},'{}','{}','{}','{}')".format(i,temp,row['time'],row['author'],row['link'])
    print(sql)
    i = i + 1
    curs.execute(sql)
conn.commit()
conn.close()
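As a quick sanity check after loading, a short sketch that counts the rows landed in each source table (same connection settings as above; the table names come from the schema):

import pymysql

conn = pymysql.connect(host='database-1.cg0acc768it6.us-east-1.rds.amazonaws.com',
                       user='admin', password='41545737!', db='os_db', charset='utf8')
curs = conn.cursor()
for table in ('naver', 'twitter', 'youtube'):
    # table names cannot be parameterized, so they are formatted in from a fixed list
    curs.execute("SELECT COUNT(*) FROM {}".format(table))
    print(table, curs.fetchone()[0])
conn.close()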