함수기능추가 및 모듈분리

장준영
Commit 0ef498065ae817c8b126228540a71b5b0f81269d 0ef49806 1 parent 7500dcc6
Showing 3 changed files with 161 additions and 51 deletions
naverNews/main.py
naverNews/naverNews.md
naverNews/naverNews_crawling.py
--- a/naverNews/main.py 0 → 100644
View file @0ef4980
+++ b/naverNews/main.py 0 → 100644
View file @0ef4980
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "******************************\n",
+      "\n",
+      "\n",
+      "***< Naver News Crawling >****\n",
+      "\n",
+      "\n",
+      "******************************\n",
+      "검색하고자 하는 url을 입력해주세요: https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175\n",
+      "comment_list를 가져오는 중.....\n",
+      "Message: element not interactable\n",
+      "  (Session info: chrome=78.0.3904.97)\n",
+      "\n",
+      "[{'userID': 'ydja****', 'comment': '옹벤져스 너무웃겨', 'time': '6일 전'}, {'userID': 'kims****', 'comment': '사랑해요 옹벤져스! 준기엄마 다리 찢을 때 웃겨죽는 줄 진짜 츤데레언니들', 'time': '6일 전'}, {'userID': 'hoho****', 'comment': '옹벤져스가 다른 마을 살인마 잡는 이야기로 시즌 2. 갑시다', 'time': '6일 전'}]\n",
+      "comment_list를 다 가져왔습니다!\n"
+     ]
+    }
+   ],
+   "source": [
+    "import naverNews_crawling \n",
+    "from time import sleep\n",
+    "\n",
+    "def print_cList(c_List) :\n",
+    "    for item in c_List :\n",
+    "        print(item)\n",
+    "\n",
+    "def search_by_author(c_List,user_ID) :\n",
+    "        result_List = []\n",
+    "        for item in c_List :\n",
+    "            print(item['userID'])\n",
+    "            if ( user_ID in item['userID']) :\n",
+    "                result_List.append(item)\n",
+    "        return result_List\n",
+    "\n",
+    "def search_by_keyword(c_List,keyword) :\n",
+    "        result_List = []\n",
+    "        for item in c_List :\n",
+    "            print(item['comment'])\n",
+    "            if ( keyword in item['comment']) :\n",
+    "                result_List.append(item)\n",
+    "        return result_List\n",
+    "    \n",
+    "'''\n",
+    "def search_by_time(c_List,_time) :\n",
+    "        result_List = []\n",
+    "        for item in c_List :\n",
+    "            print(item['time'])\n",
+    "            if ( keyword in item['comment']) :\n",
+    "                result_List.append(item)\n",
+    "        return result_List    \n",
+    "        \n",
+    "'''    \n",
+    "\n",
+    "def main ():\n",
+    "    ## 시작화면\n",
+    "    \n",
+    "    _star = '*'\n",
+    "    print(_star.center(30,'*'))\n",
+    "    print('\\n')\n",
+    "    headString = '< Naver News Crawling >'\n",
+    "    print(headString.center(30,'*'))\n",
+    "    print('\\n')\n",
+    "    print(_star.center(30,'*'))\n",
+    "    \n",
+    "    \n",
+    "    # 검색하고자 하는 url을 입력받는다\n",
+    "    _url = input('검색하고자 하는 url을 입력해주세요: ')\n",
+    "    print('comment_list를 가져오는 중.....')\n",
+    "    cList = naverNews_crawling.getData(_url)\n",
+    "    print('comment_list를 다 가져왔습니다!')\n",
+    "\n",
+    "main()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/naverNews/naverNews.md
View file @0ef4980
+++ b/naverNews/naverNews.md
View file @0ef4980
 1. Data 받아오기
     1) selenium을 이용하여 웹페이지에서 데이터를 검색
-    2) 원하는 URL 입력받음(구현예정)
+    2) 원하는 URL을 입력받는다
     3) headless하게 구현하기 위해 chrome option 적용하여 driver 생성
     4) naverNews는 댓글 영역 하단 부 '더보기'를 지속적으로 눌러줘야하므로
        driver의 find_element_by_css_selector함수로 해당 class인 
@@ -13,9 +13,15 @@
     3) 저장된 dictionary list(info_dic)을 최종 결과 리스트인 naverNewsList에 저장한다.
 3. 함수 구현
-    위에서 받아온 데이터를 바탕으로 기능 구현 예정
     1) KEYWORD 기반 검색 기능
     2) 가장 자주 나온 단어 검색 기능
     3) ID 기반 검색 기능
     4) 시간 대별 검색 기능
-    등 여러 함수 구현 예정
\ No newline at end of file
+    등 여러 함수 구현 예정
+    
+=> 수정사항
+    
+    data를 가져와 정제하는 코드를 별도 모듈(naverNews_crawling)로 분리하고, list 형태로 저장된
+    데이터셋을 반환하도록 하여 main에서 사용할 수 있게 하였다. main에서는 url을 입력받아 해당 모듈을
+    호출하고, 반환된 리스트를 받아 사용한다. 이후 keyword 기반, id 기반, 시간대 기반 검색 함수를 구현하였고
+    (시간대 기반 검색 함수는 아직 주석 처리된 초안 상태), 시간대별 검색 함수의 기능 보강과 가장 자주 나온 단어 검색 기능을 추가 구현할 예정이다.
\ No newline at end of file
--- a/naverNews/naverNews_crawling.py
View file @0ef4980
+++ b/naverNews/naverNews_crawling.py
View file @0ef4980
@@ -3,54 +3,51 @@ from selenium.common import exceptions
 from bs4 import BeautifulSoup
 import time
-## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)
+def getData(url):
-
+    ## chrome option걸기 (headless하게 웹 크롤링 수행하기 위해<웹페이지 안보이게 하기>)
-options = webdriver.ChromeOptions()
+    options = webdriver.ChromeOptions()
-#options.add_argument('headless')
+    #options.add_argument('headless')
-#options.add_argument("disable-gpu")
+    #options.add_argument("disable-gpu")
-
+    #_url = "https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175" # 크롤링할 URL
-_url = "https://entertain.naver.com/ranking/comment/list?oid=144&aid=0000642175" # 크롤링할 URL
+    _url = url # 크롤링할 URL
-webDriver = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"  # 내 웹드라이버 위치
+    webDriver = "C:\\Users\\user\\Desktop\\chromedriver_win32\\chromedriver.exe"  # 내 웹드라이버 위치
-
+    driver = webdriver.Chrome(webDriver,chrome_options=options)
-
+    #driver = webdriver.Chrome(webDriver)
-driver = webdriver.Chrome(webDriver,chrome_options=options)
+    driver.get(_url)
-#driver = webdriver.Chrome(webDriver)
+    pageCnt = 0
-driver.get(_url)
+    driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함
-
+    try:
-pageCnt = 0
+        while True: # 댓글 페이지 끝날때까지 돌림
-
+            #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지)
-driver.implicitly_wait(3) # 페이지가 다 로드 될때까지 기다리게함
+            driver.find_element_by_css_selector(".u_cbox_btn_more").click() 
-
+            pageCnt = pageCnt+1
-try:
+        
-    while True: # 댓글 페이지 끝날때까지 돌림
+    except exceptions.ElementNotVisibleException as e: # 페이지가 끝남
-        #driver의 find_element_by_css_selector함수로 '네이버 뉴스'의 댓글 '더보기' 버튼을 찾아서 계속 클릭해준다(끝까지)
+        pass
-        driver.find_element_by_css_selector(".u_cbox_btn_more").click() 
+        
-        pageCnt = pageCnt+1
+    except Exception as e: # 다른 예외 발생시 확인
+        print(e)
-except exceptions.ElementNotVisibleException as e: # 페이지가 끝남
+    pageSource = driver.page_source # 페이지 소스를 따와서
-    pass
+    result = BeautifulSoup(pageSource, "lxml") # 빠르게 뽑아오기 위해 lxml 사용
+
+    # nickname, text, time을 raw하게 뽑아온다
+    comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
+    nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
+    times_raw = result.find_all("span", {"class" : "u_cbox_date"})
+
+    # nickname, text, time 값 만을 뽑아내어 리스트로 정리한다
+    comments = [comment.text for comment in comments_raw]
+    nicknames = [nickname.text for nickname in nicknames_raw]
+    times = [time.text for time in times_raw]
-except Exception as e: # 다른 예외 발생시 확인
+    naverNewsList = []
-    print(e)
-pageSource = driver.page_source # 페이지 소스를 따와서
-result = BeautifulSoup(pageSource, "lxml") # 빠르게 뽑아오기 위해 lxml 사용
-
-# nickname, text, time을 raw하게 뽑아온다
-comments_raw = result.find_all("span", {"class" : "u_cbox_contents"})
-nicknames_raw = result.find_all("span", {"class" : "u_cbox_nick"})
-times_raw = result.find_all("span", {"class" : "u_cbox_date"})
-
-# nickname, text, time 값 만을 뽑아내어 리스트로 정리한다
-comments = [comment.text for comment in comments_raw]
-nicknames = [nickname.text for nickname in nicknames_raw]
-times = [time.text for time in times_raw]
-
-naverNewsList = []
-
-for i in range(len(comments)):
-    info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
-    naverNewsList.append(info_dic)
-
-print(naverNewsList[:3])
-#driver.quit()
\ No newline at end of file
+    for i in range(len(comments)):
+        info_dic = {'userID' : nicknames[i], 'comment' : comments[i], 'time' : times[i]}
+        naverNewsList.append(info_dic)
+        
+    print(naverNewsList[:3])
+    
+    return naverNewsList
+    #driver.quit()
\ No newline at end of file