함수기능추가 및 모듈분리

장준영
Commit 0ef498065ae817c8b126228540a71b5b0f81269d 0ef49806 1 parent 7500dcc6
Showing 3 changed files with 161 additions and 51 deletions
naverNews/main.py
naverNews/naverNews.md
naverNews/naverNews_crawling.py
--- a/naverNews/main.py 0 → 100644
View file @0ef4980
+++ b/naverNews/main.py 0 → 100644
View file @0ef4980
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "******************************\n",
+       "\n",
+       "\n",
+       "***< Naver News Crawling >****\n",
+       "\n",
+       "\n",
+       "******************************\n",
+       "검색하고자 하는 url을 입력해주세요: https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175\n",
+       "comment_list를 가져오는 중.....\n",
+       "Message: element not interactable\n",
+       "  (Session info: chrome=78.0.3904.97)\n",
+       "\n",
+       "[{'userID': 'ydja****', 'comment': '옹벤져스 너무웃겨', 'time': '6일 전'}, {'userID': 'kims****', 'comment': '사랑해요 옹벤져스! 준기엄마 다리 찢을 때 웃겨죽는 줄 진짜 츤데레언니들', 'time': '6일 전'}, {'userID': 'hoho****', 'comment': '옹벤져스가 다른 마을 살인마 잡는 이야기로 시즌 2. 갑시다', 'time': '6일 전'}]\n",
+       "comment_list를 다 가져왔습니다!\n"
+      ]
+     }
+    ],
+    "source": [
+     "import naverNews_crawling \n",
+     "from time import sleep\n",
+     "\n",
+     "def print_cList(c_List) :\n",
+     "    for item in c_List :\n",
+     "        print(item)\n",
+     "\n",
+     "def search_by_author(c_List,user_ID) :\n",
+     "        result_List = []\n",
+     "        for item in c_List :\n",
+     "            print(item['userID'])\n",
+     "            if ( user_ID in item['userID']) :\n",
+     "                result_List.append(item)\n",
+     "        return result_List\n",
+     "\n",
+     "def search_by_keyword(c_List,keyword) :\n",
+     "        result_List = []\n",
+     "        for item in c_List :\n",
+     "            print(item['comment'])\n",
+     "            if ( keyword in item['comment']) :\n",
+     "                result_List.append(item)\n",
+     "        return result_List\n",
+     "    \n",
+     "'''\n",
+     "def search_by_time(c_List,_time) :\n",
+     "        result_List = []\n",
+     "        for item in c_List :\n",
+     "            print(item['time'])\n",
+     "            if ( keyword in item['comment']) :\n",
+     "                result_List.append(item)\n",
+     "        return result_List    \n",
+     "        \n",
+     "'''    \n",
+     "\n",
+     "def main ():\n",
+     "    ## 시작화면\n",
+     "    \n",
+     "    _star = '*'\n",
+     "    print(_star.center(30,'*'))\n",
+     "    print('\\n')\n",
+     "    headString = '< Naver News Crawling >'\n",
+     "    print(headString.center(30,'*'))\n",
+     "    print('\\n')\n",
+     "    print(_star.center(30,'*'))\n",
+     "    \n",
+     "    \n",
+     "    # 검색하고자 하는 url을 입력받는다\n",
+     "    _url = input('검색하고자 하는 url을 입력해주세요: ')\n",
+     "    print('comment_list를 가져오는 중.....')\n",
+     "    cList = naverNews_crawling.getData(_url)\n",
+     "    print('comment_list를 다 가져왔습니다!')\n",
+     "\n",
+     "main()"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.7.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
--- a/naverNews/naverNews.md
View file @0ef4980
+++ b/naverNews/naverNews.md
View file @0ef4980
 1. Data 받아오기
     1) selenium을 이용하여 웹페이지에서 데이터를 검색
-     2) 원하는 URL 입력받음(구현예정)
+     2) 원하는 URL을 입력받는다
     3) headless하게 구현하기 위해 chrome option 적용하여 driver 생성
     4) naverNews는 댓글 영역 하단 부 '더보기'를 지속적으로 눌러줘야하므로
        driver의 find_element_by_css_selector함수로 해당 class인 
@@ -13,9 +13,15 @@
     3) 저장된 dictionary list(info_dic)을 최종 결과 리스트인 naverNewsList에 저장한다.
     
 3. 함수 구현
-     위에서 받아온 데이터를 바탕으로 기능 구현 예정
     1) KEYWORD 기반 검색 기능
     2) 가장 자주 나온 단어 검색 기능
     3) ID 기반 검색 기능
     4) 시간 대별 검색 기능
-     등 여러 함수 구현 예정
\ No newline at end of file
+     등 여러 함수 구현 예정
+     
+ => 수정사항
+     
+     data를 가져와 정제하는 코드를 별도 모듈(naverNews_crawling.py)로 분리하고, 이 모듈이 list 형태로
+     저장된 데이터셋을 반환하여 main에서 사용할 수 있도록 하였다. main에서는 url을 입력받아 모듈에
+     전달하고, 반환된 리스트를 받아 사용한다. 이후 keyword 기반, id 기반, 시간대 기반 검색 함수를 구현하였고,
+     시간대별 검색 함수의 기능 보강과 가장 자주 나온 단어 검색 기능을 추가로 구현할 예정이다.
\ No newline at end of file
--- a/naverNews/naverNews_crawling.py
View file @0ef4980
+++ b/naverNews/naverNews_crawling.py
View file @0ef4980
@@ -3,54 +3,51 @@ from selenium.common import exceptions
 from bs4 import BeautifulSoup
 import time
 
- ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)
- 
- options = webdriver.ChromeOptions()
- #options.add_argument('headless')
- #options.add_argument("disable-gpu")
- 
- _url = "https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175" # 크롤링할 URL
- webDriver = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"  # 내 웹드라이버 위치
- 
- 
- driver = webdriver.Chrome(webDriver,chrome_options=options)
- #driver = webdriver.Chrome(webDriver)
- driver.get(_url)
- 
- pageCnt = 0
- 
- driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함
- 
- try:
-     while True: # 댓글 페이지 끝날때까지 돌림
-         #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지)
-         driver.find_element_by_css_selector(".u_cbox_btn_more").click() 
-         pageCnt = pageCnt+1
+ def getData(url):
+     ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)
+     options = webdriver.ChromeOptions()
+     #options.add_argument('headless')
+     #options.add_argument("disable-gpu")
+     #_url = "https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175" # 크롤링할 URL
+     _url = url # 크롤링할 URL
+     webDriver = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"  # 내 웹드라이버 위치
+     driver = webdriver.Chrome(webDriver,chrome_options=options)
+     #driver = webdriver.Chrome(webDriver)
+     driver.get(_url)
+     pageCnt = 0
+     driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함
+     try:
+         while True: # 댓글 페이지 끝날때까지 돌림
+             #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지)
+             driver.find_element_by_css_selector(".u_cbox_btn_more").click() 
+             pageCnt = pageCnt+1
+         
+     except exceptions.ElementNotVisibleException as e: # 페이지가 끝남
+         pass
+         
+     except Exception as e: # 다른 예외 발생시 확인
+         print(e)
     
- except exceptions.ElementNotVisibleException as e: # 페이지가 끝남
-     pass
+     pageSource = driver.page_source # 페이지 소스를 따와서
+     result = BeautifulSoup(pageSource, "lxml") # 빠르게 뽑아오기 위해 lxml 사용
+ 
+     # nickname, text, time을 raw하게 뽑아온다
+     comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
+     nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
+     times_raw = result.find_all("span", {"class" : "u_cbox_date"})
+ 
+     # nickname, text, time 값 만을 뽑아내어 리스트로 정리한다
+     comments = [comment.text for comment in comments_raw]
+     nicknames = [nickname.text for nickname in nicknames_raw]
+     times = [time.text for time in times_raw]
     
- except Exception as e: # 다른 예외 발생시 확인
-     print(e)
+     naverNewsList = []
     
- pageSource = driver.page_source # 페이지 소스를 따와서
- result = BeautifulSoup(pageSource, "lxml") # 빠르게 뽑아오기 위해 lxml 사용
- 
- # nickname, text, time을 raw하게 뽑아온다
- comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
- nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
- times_raw = result.find_all("span", {"class" : "u_cbox_date"})
- 
- # nickname, text, time 값 만을 뽑아내어 리스트로 정리한다
- comments = [comment.text for comment in comments_raw]
- nicknames = [nickname.text for nickname in nicknames_raw]
- times = [time.text for time in times_raw]
- 
- naverNewsList = []
- 
- for i in range(len(comments)):
-     info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
-     naverNewsList.append(info_dic)
- 
- print(naverNewsList[:3])
- #driver.quit()
\ No newline at end of file
+     for i in range(len(comments)):
+         info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
+         naverNewsList.append(info_dic)
+         
+     print(naverNewsList[:3])
+     
+     return naverNewsList
+     #driver.quit()
\ No newline at end of file