Showing 3 changed files with 209 additions and 160 deletions

naverNews/main.py deleted 100644 → 0
1 | -{ | ||
2 | - "cells": [ | ||
3 | - { | ||
4 | - "cell_type": "code", | ||
5 | - "execution_count": 2, | ||
6 | - "metadata": {}, | ||
7 | - "outputs": [ | ||
8 | - { | ||
9 | - "name": "stdout", | ||
10 | - "output_type": "stream", | ||
11 | - "text": [ | ||
12 | - "******************************\n", | ||
13 | - "\n", | ||
14 | - "\n", | ||
15 | - "***< Naver News Crawling >****\n", | ||
16 | - "\n", | ||
17 | - "\n", | ||
18 | - "******************************\n", | ||
19 | - "검색하고자 하는 url을 입력해주세요: https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175\n", | ||
20 | - "comment_list를 가져오는 중.....\n", | ||
21 | - "Message: element not interactable\n", | ||
22 | - " (Session info: chrome=78.0.3904.97)\n", | ||
23 | - "\n", | ||
24 | - "[{'userID': 'ydja****', 'comment': '옹벤져스 너무웃겨', 'time': '6일 전'}, {'userID': 'kims****', 'comment': '사랑해요 옹벤져스! 준기엄마 다리 찢을 때 웃겨죽는 줄 진짜 츤데레언니들', 'time': '6일 전'}, {'userID': 'hoho****', 'comment': '옹벤져스가 다른 마을 살인마 잡는 이야기로 시즌 2. 갑시다', 'time': '6일 전'}]\n", | ||
25 | - "comment_list를 다 가져왔습니다!\n" | ||
26 | - ] | ||
27 | - } | ||
28 | - ], | ||
29 | - "source": [ | ||
30 | - "import naverNews_crawling \n", | ||
31 | - "from time import sleep\n", | ||
32 | - "\n", | ||
33 | - "def print_cList(c_List) :\n", | ||
34 | - " for item in c_List :\n", | ||
35 | - " print(item)\n", | ||
36 | - "\n", | ||
37 | - "def search_by_author(c_List,user_ID) :\n", | ||
38 | - " result_List = []\n", | ||
39 | - " for item in c_List :\n", | ||
40 | - " print(item['userID'])\n", | ||
41 | - " if ( user_ID in item['userID']) :\n", | ||
42 | - " result_List.append(item)\n", | ||
43 | - " return result_List\n", | ||
44 | - "\n", | ||
45 | - "def search_by_keyword(c_List,keyword) :\n", | ||
46 | - " result_List = []\n", | ||
47 | - " for item in c_List :\n", | ||
48 | - " print(item['comment'])\n", | ||
49 | - " if ( keyword in item['comment']) :\n", | ||
50 | - " result_List.append(item)\n", | ||
51 | - " return result_List\n", | ||
52 | - " \n", | ||
53 | - "'''\n", | ||
54 | - "def search_by_time(c_List,_time) :\n", | ||
55 | - " result_List = []\n", | ||
56 | - " for item in c_List :\n", | ||
57 | - " print(item['time'])\n", | ||
58 | - " if ( keyword in item['comment']) :\n", | ||
59 | - " result_List.append(item)\n", | ||
60 | - " return result_List \n", | ||
61 | - " \n", | ||
62 | - "''' \n", | ||
63 | - "\n", | ||
64 | - "def main ():\n", | ||
65 | - " ## 시작화면\n", | ||
66 | - " \n", | ||
67 | - " _star = '*'\n", | ||
68 | - " print(_star.center(30,'*'))\n", | ||
69 | - " print('\\n')\n", | ||
70 | - " headString = '< Naver News Crawling >'\n", | ||
71 | - " print(headString.center(30,'*'))\n", | ||
72 | - " print('\\n')\n", | ||
73 | - " print(_star.center(30,'*'))\n", | ||
74 | - " \n", | ||
75 | - " \n", | ||
76 | - " # 검색하고자 하는 url을 입력받는다\n", | ||
77 | - " _url = input('검색하고자 하는 url을 입력해주세요: ')\n", | ||
78 | - " print('comment_list를 가져오는 중.....')\n", | ||
79 | - " cList = naverNews_crawling.getData(_url)\n", | ||
80 | - " print('comment_list를 다 가져왔습니다!')\n", | ||
81 | - "\n", | ||
82 | - "main()" | ||
83 | - ] | ||
84 | - } | ||
85 | - ], | ||
86 | - "metadata": { | ||
87 | - "kernelspec": { | ||
88 | - "display_name": "Python 3", | ||
89 | - "language": "python", | ||
90 | - "name": "python3" | ||
91 | - }, | ||
92 | - "language_info": { | ||
93 | - "codemirror_mode": { | ||
94 | - "name": "ipython", | ||
95 | - "version": 3 | ||
96 | - }, | ||
97 | - "file_extension": ".py", | ||
98 | - "mimetype": "text/x-python", | ||
99 | - "name": "python", | ||
100 | - "nbconvert_exporter": "python", | ||
101 | - "pygments_lexer": "ipython3", | ||
102 | - "version": "3.7.3" | ||
103 | - } | ||
104 | - }, | ||
105 | - "nbformat": 4, | ||
106 | - "nbformat_minor": 2 | ||
107 | -} |
@@ -25,3 +25,8 @@
25 | so that it can be used from main. Afterwards, main receives the list, takes a url as input, and fetches
26 | the data that way. Keyword-based, id-based, and time-based search functions were then implemented, and
27 | strengthening the time-based search and adding a most-frequent-word search are planned as follow-ups.
28 | +
29 | +* 4th revision
30 | +
31 | +  Resolved the import-related errors that occurred while the code was split across separate files (everything is now managed in a single file).
32 | +  The skeleton of the user UI is in place; detailed per-function implementation comes next (see the usage sketch after this hunk).
\ No newline at end of file
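For reference, the flow described in the note above (getData() returns a list of comment dicts, which main then filters by nickname, keyword, or time) is plain list-of-dict filtering. The sketch below is illustrative only: the sample entries are copied from the notebook output earlier in this commit, and filter_comments is a hypothetical helper that mirrors the search_by_author / search_by_keyword functions added further down, not code from the repository.

# Usage sketch for the comment-list schema produced by getData():
# each entry is {'userID': ..., 'comment': ..., 'time': ...}.
sample_comments = [
    {'userID': 'ydja****', 'comment': '옹벤져스 너무웃겨', 'time': '6일 전'},
    {'userID': 'kims****', 'comment': '사랑해요 옹벤져스! 준기엄마 다리 찢을 때 웃겨죽는 줄 진짜 츤데레언니들', 'time': '6일 전'},
]

def filter_comments(c_list, field, needle):
    # Hypothetical helper: keep entries whose given field contains the search string,
    # the same containment test the notebook's search functions use.
    return [item for item in c_list if needle in item[field]]

print(filter_comments(sample_comments, 'userID', 'ydja'))      # nickname-based search
print(filter_comments(sample_comments, 'comment', '옹벤져스'))  # keyword-based search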
1 | -from selenium import webdriver | ||
2 | -from selenium.common import exceptions | ||
3 | -from bs4 import BeautifulSoup | ||
4 | -import time | ||
5 | - | ||
6 | -def getData(url): | ||
7 | - ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>) | ||
8 | - options = webdriver.ChromeOptions() | ||
9 | - #options.add_argument('headless') | ||
10 | - #options.add_argument("disable-gpu") | ||
11 | - #_url = "https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175" # 크롤링할 URL | ||
12 | - _url = url # 크롤링할 URL | ||
13 | - webDriver = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe" # 내 웹드라이버 위치 | ||
14 | - driver = webdriver.Chrome(webDriver,chrome_options=options) | ||
15 | - #driver = webdriver.Chrome(webDriver) | ||
16 | - driver.get(_url) | ||
17 | - pageCnt = 0 | ||
18 | - driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함 | ||
19 | - try: | ||
20 | - while True: # 댓글 페이지 끝날때까지 돌림 | ||
21 | - #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지) | ||
22 | - driver.find_element_by_css_selector(".u_cbox_btn_more").click() | ||
23 | - pageCnt = pageCnt+1 | ||
24 | - | ||
25 | - except exceptions.ElementNotVisibleException as e: # 페이지가 끝남 | ||
26 | - pass | ||
27 | - | ||
28 | - except Exception as e: # 다른 예외 발생시 확인 | ||
29 | - print(e) | ||
30 | - | ||
31 | - pageSource = driver.page_source # 페이지 소스를 따와서 | ||
32 | - result = BeautifulSoup(pageSource, "lxml") # 빠르게 뽑아오기 위해 lxml 사용 | ||
33 | - | ||
34 | - # nickname, text, time을 raw하게 뽑아온다 | ||
35 | - comments_raw = result.find_all("span", {"class" : "u_cbox_contents"}) | ||
36 | - nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"}) | ||
37 | - times_raw = result.find_all("span", {"class" : "u_cbox_date"}) | ||
38 | - | ||
39 | - # nickname, text, time 값 만을 뽑아내어 리스트로 정리한다 | ||
40 | - comments = [comment.text for comment in comments_raw] | ||
41 | - nicknames = [nickname.text for nickname in nicknames_raw] | ||
42 | - times = [time.text for time in times_raw] | ||
43 | - | ||
44 | - naverNewsList = [] | ||
45 | - | ||
46 | - for i in range(len(comments)): | ||
47 | - info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]} | ||
48 | - naverNewsList.append(info_dic) | ||
49 | - | ||
50 | - print(naverNewsList[:3]) | ||
51 | - | ||
52 | - return naverNewsList | ||
53 | - #driver.quit() | ||
\ No newline at end of file
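A note on the "Message: element not interactable" line in the first notebook's output: the loop above only treats ElementNotVisibleException as the normal end of the comment pages, so when Chrome instead reports the '더보기' (more) button as not interactable, the error falls through to the generic except branch and gets printed. The sketch below is one possible variant, not the repository's code; it assumes the Selenium 3-style driver API used here and treats the common "button is gone or inert" exceptions as the expected stopping condition.

from selenium.common.exceptions import (
    ElementNotInteractableException,
    ElementNotVisibleException,
    NoSuchElementException,
)

def click_more_until_done(driver, selector=".u_cbox_btn_more"):
    # Keep clicking the 'more' button; stop quietly once it is gone or no longer clickable.
    clicks = 0
    while True:
        try:
            driver.find_element_by_css_selector(selector).click()
            clicks += 1
        except (ElementNotVisibleException,
                ElementNotInteractableException,
                NoSuchElementException):
            return clicks  # normal end: no more comment pages to expand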
1 | +{ | ||
2 | + "cells": [ | ||
3 | + { | ||
4 | + "cell_type": "code", | ||
5 | + "execution_count": 3, | ||
6 | + "metadata": {}, | ||
7 | + "outputs": [ | ||
8 | + { | ||
9 | + "name": "stdout", | ||
10 | + "output_type": "stream", | ||
11 | + "text": [ | ||
12 | + "******************************\n", | ||
13 | + "\n", | ||
14 | + "\n", | ||
15 | + "***< Naver News Crawling >****\n", | ||
16 | + "\n", | ||
17 | + "\n", | ||
18 | + "******************************\n", | ||
19 | + "검색하고자 하는 url을 입력해주세요: \n", | ||
20 | + "comment_list를 가져오는 중.....\n" | ||
21 | + ] | ||
22 | + }, | ||
23 | + { | ||
24 | + "name": "stderr", | ||
25 | + "output_type": "stream", | ||
26 | + "text": [ | ||
27 | + "C:\\Users\\user\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:14: DeprecationWarning: use options instead of chrome_options\n", | ||
28 | + " \n" | ||
29 | + ] | ||
30 | + }, | ||
31 | + { | ||
32 | + "ename": "InvalidArgumentException", | ||
33 | + "evalue": "Message: invalid argument\n (Session info: chrome=78.0.3904.108)\n", | ||
34 | + "output_type": "error", | ||
35 | + "traceback": [ | ||
36 | + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | ||
37 | + "\u001b[1;31mInvalidArgumentException\u001b[0m Traceback (most recent call last)", | ||
38 | + "\u001b[1;32m<ipython-input-3-aa9195667f4b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 114\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 115\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | ||
39 | + "\u001b[1;32m<ipython-input-3-aa9195667f4b>\u001b[0m in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[0m_url\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'검색하고자 하는 url을 입력해주세요: '\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'comment_list를 가져오는 중.....'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[0mcList\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetData\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_url\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'\\n'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'comment_list를 다 가져왔습니다!'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||
40 | + "\u001b[1;32m<ipython-input-3-aa9195667f4b>\u001b[0m in \u001b[0;36mgetData\u001b[1;34m(url)\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[0mdriver\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mChrome\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwebDriver\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mchrome_options\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;31m#driver = webdriver.Chrome(webDriver)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_url\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 17\u001b[0m \u001b[0mpageCnt\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimplicitly_wait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 페이지가 다 로드 될때까지 기다리게함\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||
41 | + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 331\u001b[0m \u001b[0mLoads\u001b[0m \u001b[0ma\u001b[0m \u001b[0mweb\u001b[0m \u001b[0mpage\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcurrent\u001b[0m \u001b[0mbrowser\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 332\u001b[0m \"\"\"\n\u001b[1;32m--> 333\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGET\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'url'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 334\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 335\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||
42 | + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 322\u001b[0m response['value'] = self._unwrap_value(\n\u001b[0;32m 323\u001b[0m response.get('value', None))\n", | ||
43 | + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 240\u001b[0m \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 244\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | ||
44 | + "\u001b[1;31mInvalidArgumentException\u001b[0m: Message: invalid argument\n (Session info: chrome=78.0.3904.108)\n" | ||
45 | + ] | ||
46 | + } | ||
47 | + ], | ||
48 | + "source": [ | ||
49 | + "from selenium import webdriver\n", | ||
50 | + "from selenium.common import exceptions\n", | ||
51 | + "from bs4 import BeautifulSoup\n", | ||
52 | + "import time\n", | ||
53 | + "\n", | ||
54 | + "def getData(url):\n", | ||
55 | + " ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)\n", | ||
56 | + " options = webdriver.ChromeOptions()\n", | ||
57 | + " #options.add_argument('headless')\n", | ||
58 | + " #options.add_argument(\"disable-gpu\")\n", | ||
59 | + " #_url = \"https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175\" # 크롤링할 URL\n", | ||
60 | + " _url = url # 크롤링할 URL\n", | ||
61 | + " webDriver = \"C:\\\\Users\\\\user\\\\Desktop\\\\chromedriver_win32\\\\chromedriver.exe\" # 내 웹드라이버 위치\n", | ||
62 | + " driver = webdriver.Chrome(webDriver,chrome_options=options)\n", | ||
63 | + " #driver = webdriver.Chrome(webDriver)\n", | ||
64 | + " driver.get(_url)\n", | ||
65 | + " pageCnt = 0\n", | ||
66 | + " driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함\n", | ||
67 | + " try:\n", | ||
68 | + " while True: # 댓글 페이지 끝날때까지 돌림\n", | ||
69 | + " #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지)\n", | ||
70 | + " driver.find_element_by_css_selector(\".u_cbox_btn_more\").click() \n", | ||
71 | + " pageCnt = pageCnt+1\n", | ||
72 | + " \n", | ||
73 | + " except exceptions.ElementNotVisibleException as e: # 페이지가 끝남\n", | ||
74 | + " pass\n", | ||
75 | + " \n", | ||
76 | + " except Exception as e: # 다른 예외 발생시 확인\n", | ||
77 | + " print(e)\n", | ||
78 | + " \n", | ||
79 | + " pageSource = driver.page_source # 페이지 소스를 따와서\n", | ||
80 | + " result = BeautifulSoup(pageSource, \"lxml\") # 빠르게 뽑아오기 위해 lxml 사용\n", | ||
81 | + "\n", | ||
82 | + " # nickname, text, time을 raw하게 뽑아온다\n", | ||
83 | + " comments_raw = result.find_all(\"span\", {\"class\" : \"u_cbox_contents\"})\n", | ||
84 | + " nicknames_raw = result.find_all(\"span\", {\"class\" : \"u_cbox_nick\"})\n", | ||
85 | + " times_raw = result.find_all(\"span\", {\"class\" : \"u_cbox_date\"})\n", | ||
86 | + "\n", | ||
87 | + " # nickname, text, time 값 만을 뽑아내어 리스트로 정리한다\n", | ||
88 | + " comments = [comment.text for comment in comments_raw]\n", | ||
89 | + " nicknames = [nickname.text for nickname in nicknames_raw]\n", | ||
90 | + " times = [time.text for time in times_raw]\n", | ||
91 | + " \n", | ||
92 | + " naverNewsList = []\n", | ||
93 | + " \n", | ||
94 | + " for i in range(len(comments)):\n", | ||
95 | + " info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}\n", | ||
96 | + " naverNewsList.append(info_dic)\n", | ||
97 | + " \n", | ||
98 | + " return naverNewsList\n", | ||
99 | + " #driver.quit()\n", | ||
100 | + " \n", | ||
101 | + "from time import sleep\n", | ||
102 | + "\n", | ||
103 | + "def print_cList(c_List) :\n", | ||
104 | + " for item in c_List :\n", | ||
105 | + " print(item)\n", | ||
106 | + "\n", | ||
107 | + "def search_by_author(c_List,user_ID) :\n", | ||
108 | + " result_List = []\n", | ||
109 | + " for item in c_List :\n", | ||
110 | + " print(item['userID'])\n", | ||
111 | + " if ( user_ID in item['userID']) :\n", | ||
112 | + " result_List.append(item)\n", | ||
113 | + " return result_List\n", | ||
114 | + "\n", | ||
115 | + "def search_by_keyword(c_List,keyword) :\n", | ||
116 | + " result_List = []\n", | ||
117 | + " for item in c_List :\n", | ||
118 | + " print(item['comment'])\n", | ||
119 | + " if ( keyword in item['comment']) :\n", | ||
120 | + " result_List.append(item)\n", | ||
121 | + " return result_List\n", | ||
122 | + " \n", | ||
123 | + "\n", | ||
124 | + "def search_by_time(c_List,_time) :\n", | ||
125 | + " result_List = []\n", | ||
126 | + " for item in c_List :\n", | ||
127 | + " print(item['time'])\n", | ||
128 | + "        if ( _time in item['time']) :\n", | ||
129 | + " result_List.append(item)\n", | ||
130 | + " return result_List\n", | ||
131 | + " \n", | ||
132 | + "def main ():\n", | ||
133 | + " ## 시작화면\n", | ||
134 | + " \n", | ||
135 | + " _star = '*'\n", | ||
136 | + " print(_star.center(30,'*'))\n", | ||
137 | + " print('\\n')\n", | ||
138 | + " headString = '< Naver News Crawling >'\n", | ||
139 | + " print(headString.center(30,'*'))\n", | ||
140 | + " print('\\n')\n", | ||
141 | + " print(_star.center(30,'*'))\n", | ||
142 | + " \n", | ||
143 | + " \n", | ||
144 | + " # 검색하고자 하는 url을 입력받는다\n", | ||
145 | + " _url = input('검색하고자 하는 url을 입력해주세요: ')\n", | ||
146 | + " print('comment_list를 가져오는 중.....')\n", | ||
147 | + " cList = getData(_url)\n", | ||
148 | + " print('\\n')\n", | ||
149 | + " print('comment_list를 다 가져왔습니다!')\n", | ||
150 | + " \n", | ||
151 | + " while(True):\n", | ||
152 | + " print('***********************************')\n", | ||
153 | + " print('1.닉네임 기반 검색')\n", | ||
154 | + " print('2.키워드 기반 검색')\n", | ||
155 | + " print('3.작성시간 기반 검색')\n", | ||
156 | + " menu = input('메뉴를 입력해주세요: ')\n", | ||
157 | + " \n", | ||
158 | + "        if(menu == '1'):\n", | ||
159 | + " print('***********************************')\n", | ||
160 | + " inputID = input('검색할 닉네임 앞 4자리를 입력해주세요: ')\n", | ||
161 | + " search_by_author(cList,inputID)\n", | ||
162 | + "        elif(menu == '2'):\n", | ||
163 | + " print('***********************************')\n", | ||
164 | + " inputKW = input('검색할 키워드를 입력해주세요: ')\n", | ||
165 | + " search_by_keyword(cList,inputKW)\n", | ||
166 | + " else:\n", | ||
167 | + " print('***********************************')\n", | ||
168 | + " inputTime = input('검색할 시간대를 입력해주세요: ')\n", | ||
169 | + " search_by_time(cList,inputTime)\n", | ||
170 | + "\n", | ||
171 | + " \n", | ||
172 | + "main()" | ||
173 | + ] | ||
174 | + }, | ||
175 | + { | ||
176 | + "cell_type": "code", | ||
177 | + "execution_count": null, | ||
178 | + "metadata": {}, | ||
179 | + "outputs": [], | ||
180 | + "source": [] | ||
181 | + } | ||
182 | + ], | ||
183 | + "metadata": { | ||
184 | + "kernelspec": { | ||
185 | + "display_name": "Python 3", | ||
186 | + "language": "python", | ||
187 | + "name": "python3" | ||
188 | + }, | ||
189 | + "language_info": { | ||
190 | + "codemirror_mode": { | ||
191 | + "name": "ipython", | ||
192 | + "version": 3 | ||
193 | + }, | ||
194 | + "file_extension": ".py", | ||
195 | + "mimetype": "text/x-python", | ||
196 | + "name": "python", | ||
197 | + "nbconvert_exporter": "python", | ||
198 | + "pygments_lexer": "ipython3", | ||
199 | + "version": "3.7.3" | ||
200 | + } | ||
201 | + }, | ||
202 | + "nbformat": 4, | ||
203 | + "nbformat_minor": 2 | ||
204 | +} | ... | ... |
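Two details from the run captured in this notebook's output are worth noting. The stderr block shows "DeprecationWarning: use options instead of chrome_options"; with the Selenium 3.x API the notebook appears to use, passing options= instead of chrome_options= silences it. The InvalidArgumentException is consistent with driver.get() receiving an empty or malformed address, since no URL appears after the input prompt in the captured output, so validating _url before navigating would make that failure clearer. A minimal sketch of the driver setup under those assumptions, reusing the notebook's chromedriver path:

from selenium import webdriver

options = webdriver.ChromeOptions()
# options.add_argument('headless')      # optional flags, as in the notebook
# options.add_argument('disable-gpu')

web_driver_path = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"

# Selenium 3.x: the options= keyword replaces the deprecated chrome_options=.
driver = webdriver.Chrome(web_driver_path, options=options)

url = input('검색하고자 하는 url을 입력해주세요: ').strip()
if url.startswith('http'):
    driver.get(url)
else:
    print('Not a valid URL.')  # guard against the empty-input case seen in the output above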