Showing
2 changed files
with
237 additions
and
205 deletions
... | @@ -28,5 +28,14 @@ | ... | @@ -28,5 +28,14 @@ |
28 | 28 | ||
29 | * 4차 수정사항 | 29 | * 4차 수정사항 |
30 | 30 | ||
31 | - 기존파일의 분라 관리 시, import관련 오류 문제 해결 완료(하나의 파일로 관리) | 31 | + 기존파일의 분리 관리 시, import관련 오류 문제 해결 완료(하나의 파일로 관리) |
32 | 사용자 UI의 틀을 구축해놓았고, 곧바로 함수별 추가 세부 구현 예정 | 32 | 사용자 UI의 틀을 구축해놓았고, 곧바로 함수별 추가 세부 구현 예정 |
33 | + | ||
34 | +* 5차 수정사항 | ||
35 | + | ||
36 | + 1) 네이버 댓글 공간에서 받아온 날짜 정보를 YYYY-MM-DD 형식으로 바꿈. ('방금 전, 몇 분 전, 몇 시간 전, 며칠 전'의 경우를 처리하기 위해 datetime 모듈의 datetime과 timedelta를 활용하여 | |
37 | + 현재 날짜를 기준으로 계산하여 YYYY-MM-DD로 저장될 수 있도록 | ||
38 | + 코드 추가) | ||
39 | + 2) 시간대별로 (시작시간, 끝시간)을 입력하여 그 시간에 해당하는 기사를 출력해주는 함수 구현 | ||
40 | + | ||
41 | + 가장 많이 나온 단어 검색과 Matplotlib을 활용한 시각적 표현 구현 예정 | |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
1 | -{ | 1 | +from selenium import webdriver |
2 | - "cells": [ | 2 | +from selenium.common import exceptions |
3 | - { | 3 | +from bs4 import BeautifulSoup |
4 | - "cell_type": "code", | 4 | +from datetime import datetime, timedelta |
5 | - "execution_count": 3, | 5 | +import time |
6 | - "metadata": {}, | 6 | + |
7 | - "outputs": [ | 7 | + |
def getData(url, driver_path="C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"):
    """Collect the comments shown on a Naver news comment page.

    Opens ``url`` in Chrome through Selenium, keeps clicking the "more"
    button (CSS class ``u_cbox_btn_more``) until Selenium reports that it is
    no longer visible, then parses the final page source with BeautifulSoup.

    Args:
        url: Address of the comment page to crawl.
        driver_path: Location of the chromedriver executable. Defaults to
            the path that used to be hard-coded in this function.

    Returns:
        A list of dicts with the keys ``'userID'``, ``'comment'`` and
        ``'time'``, each holding the text of the matching ``<span>``.
        The time is returned exactly as displayed on the page.
    """
    # Headless mode stays disabled, as before; add the 'headless' and
    # 'disable-gpu' arguments here to hide the browser window.
    options = webdriver.ChromeOptions()

    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome(driver_path, options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(3)  # give element lookups up to 3 seconds
        try:
            while True:  # expand the comment list until the button goes away
                driver.find_element_by_css_selector(".u_cbox_btn_more").click()
        except exceptions.ElementNotVisibleException:
            # The "more" button is hidden: every comment page is loaded.
            pass
        except Exception as e:  # any other failure: report it, parse what we have
            print(e)

        pageSource = driver.page_source
    finally:
        # Always close the browser; previously it was left running because
        # the quit call sat (commented out) after the return statement.
        driver.quit()

    result = BeautifulSoup(pageSource, "lxml")  # lxml parser for speed

    # Raw <span> elements holding comment text, nickname and timestamp.
    comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
    nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
    times_raw = result.find_all("span", {"class" : "u_cbox_date"})

    # zip() stops at the shortest list, so a page where the three span
    # counts differ no longer raises IndexError.
    # NOTE(review): the spans are paired purely by position; confirm that
    # deleted/hidden comments cannot shift the alignment.
    return [
        {'userID' : nick.text, 'comment' : body.text, 'time' : stamp.text}
        for nick, body, stamp in zip(nicknames_raw, comments_raw, times_raw)
    ]
54 | - "def getData(url):\n", | 54 | + |
55 | - " ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)\n", | 55 | +from time import sleep |
56 | - " options = webdriver.ChromeOptions()\n", | 56 | + |
def print_cList(c_List):
    """Print every entry of ``c_List`` on its own line."""
    for line in map(str, c_List):
        print(line)
60 | - " _url = url # 크롤링할 URL\n", | 60 | + |
def search_by_author(c_List, user_ID):
    """Return the comments whose nickname contains ``user_ID``.

    Args:
        c_List: Iterable of comment dicts that have a ``'userID'`` key.
        user_ID: Text to look for; it may appear anywhere in the nickname.

    Returns:
        A new list with the matching dicts, in their original order.
    """
    return [item for item in c_List if user_ID in item['userID']]
68 | - " while True: # 댓글 페이지 끝날때까지 돌림\n", | 68 | + |
def search_by_keyword(c_List, keyword):
    """Return the comments whose text contains ``keyword``.

    Args:
        c_List: Iterable of comment dicts that have a ``'comment'`` key.
        keyword: Text to look for inside each comment body.

    Returns:
        A new list with the matching dicts, in their original order.
    """
    return [item for item in c_List if keyword in item['comment']]
76 | - " except Exception as e: # 다른 예외 발생시 확인\n", | 76 | + |
def refine_time(c_List, now=None):
    """Rewrite each comment's ``'time'`` value in place as ``YYYY-MM-DD``.

    Relative values ("N일 전", "N시간 전", "N분 전", "방금 전") are resolved
    against ``now``. Any other value is cut to its first ten characters and
    its dots are turned into dashes, so absolute and relative dates end up
    in the same format.

    Args:
        c_List: Iterable of comment dicts that have a ``'time'`` key; the
            dicts are modified in place.
        now: Reference ``datetime`` for relative values. Defaults to the
            current local time.

    Returns:
        None. A relative value whose leading number cannot be parsed is
        left unchanged instead of aborting the whole pass.
    """
    if now is None:
        now = datetime.now()

    # Marker text -> timedelta keyword, tried in this order.
    relative_units = (('일 전', 'days'), ('시간 전', 'hours'), ('분 전', 'minutes'))

    for item in c_List:
        raw = item['time']

        if '전' not in raw:
            # Absolute date: keep the date part and unify the separator.
            item['time'] = raw[0:10].replace('.', '-')
            continue

        if '방금 전' in raw:  # "just now"
            item['time'] = now.strftime('%Y-%m-%d')
            continue

        for marker, unit in relative_units:
            position = raw.find(marker)
            if position == -1:
                continue
            try:
                # Read every digit before the marker, not just the first
                # character, so "10일 전" means ten days rather than one.
                amount = int(raw[:position])
            except ValueError:
                break
            moment = now - timedelta(**{unit: amount})
            item['time'] = moment.strftime('%Y-%m-%d')
            break
110 | - " print(item['userID'])\n", | 110 | + |
111 | - " if ( user_ID in item['userID']) :\n", | 111 | + |
112 | - " result_List.append(item)\n", | 112 | + |
113 | - " return result_List\n", | 113 | + |
114 | - "\n", | 114 | + |
def _parse_ymd(text):
    """Read ``(year, month, day)`` from the fixed positions of a date string.

    Characters 0-3 are the year, 5-6 the month and 8-9 the day; the
    separator characters in between are ignored. Raises ``ValueError`` when
    any of the three fields is not a number.
    """
    return int(text[0:4]), int(text[5:7]), int(text[8:10])


def search_by_time(c_List, startTime, endTime):
    """Return the comments dated between ``startTime`` and ``endTime``.

    Args:
        c_List: Iterable of comment dicts whose ``'time'`` value starts with
            a ten-character date such as ``YYYY-MM-DD``.
        startTime: First day of the range, ``YYYY-MM-DD`` (inclusive).
        endTime: Last day of the range, ``YYYY-MM-DD`` (inclusive).

    Returns:
        A new list with the matching dicts, in their original order.
        Comments whose time cannot be parsed are skipped.

    Raises:
        ValueError: If ``startTime`` or ``endTime`` is not a valid date string.
    """
    first_day = _parse_ymd(startTime)
    last_day = _parse_ymd(endTime)

    result_List = []
    for item in c_List:
        try:
            written = _parse_ymd(item['time'])
        except (ValueError, TypeError):
            # e.g. a relative time that was never converted to a date
            continue
        # Tuples compare year first, then month, then day, so ranges that
        # cross a month or year boundary are handled correctly.
        if first_day <= written <= last_day:
            result_List.append(item)

    return result_List
163 | - " print('***********************************')\n", | 163 | + |
def printResult(c_List):
    """Print every search result in ``c_List`` on its own line.

    Iterates directly instead of indexing, so any iterable is accepted.
    """
    for item in c_List:
        print(item)
167 | - " print('***********************************')\n", | 167 | + |
def main():
    """Run the interactive console front end of the comment crawler.

    Prints a banner, asks for a URL, downloads its comments with
    ``getData`` and normalises their dates with ``refine_time``. It then
    loops over a menu offering a search by nickname, by keyword or by date
    range. Entering ``-1`` at a search prompt returns to the menu. The loop
    has no exit option.
    """
    ## start screen
    _star = '*'
    print(_star.center(30,'*'))
    print('\n')
    headString = '< Naver News Crawling >'
    print(headString.center(30,'*'))
    print('\n')
    print(_star.center(30,'*'))

    # Ask for the page to crawl and load its comments.
    _url = input('검색하고자 하는 url을 입력해주세요: ')
    print('comment_list를 가져오는 중.....')
    cList = getData(_url)
    refine_time(cList)
    print('\n')
    print('comment_list를 다 가져왔습니다!')

    while True:
        print('***********************************')
        print('1.닉네임 기반 검색')
        print('2.키워드 기반 검색')
        print('3.작성시간 기반 검색')
        menu = input('메뉴를 입력해주세요: ')

        if menu == '1':
            print('***********************************')
            inputID = input('검색할 닉네임 앞 4자리를 입력해주세요(전 단계로 가시려면 -1을 입력해주세요): ')
            if inputID == '-1':
                continue
            # The result used to be printed twice (item by item, then as a
            # raw list); only the per-item listing is kept.
            printResult(search_by_author(cList,inputID))
        elif menu == '2':
            print('***********************************')
            inputKW = input('검색할 키워드를 입력해주세요(전 단계로 가시려면 -1을 입력해주세요): ')
            if inputKW == '-1':
                continue
            printResult(search_by_keyword(cList,inputKW))
        elif menu == '3':
            print('***********************************')
            print('전 단계로 돌아가시려면 -1을 입력해주세요')
            startTime = input('검색할 시간대의 시작일을 입력해주세요(YYYY-MM-DD): ')
            if startTime == '-1':
                # Go back at once instead of still asking for the end date.
                continue
            endTime = input('검색할 시간대의 마지막 일을 입력해주세요(YYYY-MM-DD): ')
            if endTime == '-1':
                continue

            try:
                _result = search_by_time(cList,startTime,endTime)
            except (ValueError, IndexError):
                # A malformed date must not terminate the menu loop.
                print('잘못된 입력입니다')
                continue
            printResult(_result)
        else:
            print('잘못된 입력입니다')
            continue
224 | + | ||
225 | + | ||
226 | + | ||
# Start the console UI only when this file is executed as a script, so that
# importing the module (e.g. to reuse the search helpers) has no side effects.
if __name__ == "__main__":
    main()
-
Please register or login to post a comment