first crawling

최인훈
Commit b492d7d361769ffbaeff3a2b2de56dde33ae10c2 b492d7d3 1 parent dc48bab2
Showing 2 changed files with 182 additions and 0 deletions
crawling.py
practice.py
--- a/crawling.py 0 → 100644
View file @b492d7d
+++ b/crawling.py 0 → 100644
View file @b492d7d
+ import requests
+ import time
+ from bs4 import BeautifulSoup
+ 
+ 
+ def name_preprocessing(a):
+     """Reduce a raw product title to its bare product name.
+ 
+     Steps, in order:
+       1. Drop every ``[...]`` group that starts the string, then every
+          ``(...)`` group that starts the string.
+       2. Keep characters up to (not including) the first ASCII digit,
+          discarding spaces along the way.
+       3. Strip any ``(`` or ``[`` left dangling at the end.
+ 
+     Example: '[노브랜드] 미네랄 워터 생수 (2L x 6개입)' -> '미네랄워터생수'.
+ 
+     Args:
+         a: Raw product title.
+ 
+     Returns:
+         The cleaned name; may be an empty string.
+     """
+     for opener, closer in (('[', ']'), ('(', ')')):
+         while a.startswith(opener):
+             end = a.find(closer)
+             if end == -1:
+                 # Unmatched opener: leave the text alone rather than eating it.
+                 break
+             # Slice to the end of the string; a "-1" upper bound here would
+             # silently drop the title's last character on every removal.
+             a = a[end + 1:]
+     kept = []
+     for ch in a:
+         if '0' <= ch <= '9':
+             # The first digit starts the size/quantity part of the title.
+             break
+         if ch != ' ':
+             kept.append(ch)
+     # Cutting at a digit can leave the opener of "(2L ..." behind.
+     return ''.join(kept).rstrip('([ ')
+ 
+ 
+ # E-mart bottled water
+ def emart_water():
+     """Crawl the E-mart bottled-water category and print what was collected.
+ 
+     Reads the product total from page 1, then walks the listing pages until
+     that many products have been examined or a page yields no products. A
+     product is kept only when it has a unit-price element whose text
+     contains 'ml' and a name element; kept names are passed through
+     name_preprocessing(). Progress is printed while crawling, and at the end
+     the [name, price] list, the unit-price texts and both lengths are
+     printed. Returns nothing.
+     """
+     url = 'http://emart.ssg.com/category/listCategoryItem.ssg?dispCtgId=6000023670&page={}'
+     with requests.get(url.format(1)) as response:
+         soup = BeautifulSoup(response.content, 'html.parser')
+     a = soup.find('span', "tx_ko")
+     # Remove thousands separators so a count such as "1,234" still parses.
+     total = int(a.find('em').text.replace(',', '').strip())
+     print(total)
+     seen = 0  # products examined so far, whether kept or skipped
+     page_num = 1
+     water_items = []
+     unit_price = []
+     while True:
+         print(seen)
+         if seen >= total:
+             break  # every product reported by the site has been examined
+         with requests.get(url.format(page_num)) as response:
+             soup = BeautifulSoup(response.content, 'html.parser')
+         table = soup.find('div', {'id': 'ty_thmb_view'})
+         products = [] if table is None else table.find_all('li', {'class': 'cunit_t232'})
+         if not products:
+             # Nothing left to read: stop instead of requesting pages forever.
+             break
+         for emart in products:  # one <li> per product
+             # Count each product exactly once, so the loop condition tracks
+             # products rather than the number of name tags inside them.
+             seen += 1
+             # Strip the comma so the price can be compared as a number.
+             price = int(emart.find('em', {'class': 'ssg_price'}).text.replace(",", ''))
+             unitprice = emart.find('div', {'class': 'unit'})
+             if unitprice is None:
+                 continue
+             unit = unitprice.text
+             if 'ml' not in unit:
+                 continue
+             # Reset per product so a product without a name tag cannot reuse
+             # the previous product's name.
+             name = None
+             for goods in emart.find_all('a'):
+                 name_tag = goods.find('em', {'class': 'tx_ko'})
+                 if name_tag is not None:
+                     name = name_tag.text
+             if name is None:
+                 continue
+             name = name_preprocessing(name)
+             print(name)
+             water_items.append([name, price])
+             unit_price.append(unit)
+         time.sleep(10)  # pause between page requests
+         page_num += 1
+         print(page_num)
+     print(water_items)
+     print(len(water_items))
+     print(unit_price)
+     print(len(unit_price))
+ 
+ 
+ # E-mart snacks
+ def emart_snack():
+     """Crawl the E-mart snack category and print what was collected.
+ 
+     Reads the product total from page 1, then walks the listing pages until
+     that many products have been examined or a page yields no products. A
+     product is kept only when it has both a unit-price element and a name
+     element. At the end the [name, price] list, the unit-price texts and
+     both lengths are printed. Returns nothing.
+     """
+     url = 'http://emart.ssg.com/category/main.ssg?dispCtgId=6000023666&page={}'
+     with requests.get(url.format(1)) as response:
+         soup = BeautifulSoup(response.content, 'html.parser')
+     a = soup.find('span', "tx_ko")
+     # Remove thousands separators so a count such as "1,234" still parses.
+     total = int(a.find('em').text.replace(',', ''))
+     print(total)
+     seen = 0  # products examined so far, whether kept or skipped
+     page_num = 1
+     snack_items = []
+     unit_price = []
+     while seen < total:
+         with requests.get(url.format(page_num)) as response:
+             soup = BeautifulSoup(response.content, 'html.parser')
+         table = soup.find('div', {'id': 'ty_thmb_view'})
+         products = [] if table is None else table.find_all('li', {'class': 'cunit_t232'})
+         if not products:
+             # Nothing left to read: stop instead of requesting pages forever.
+             break
+         for emart in products:  # one <li> per product
+             # Count each product exactly once; counting it again for a
+             # missing unit price or for every name tag ends the crawl early.
+             seen += 1
+             # Strip the comma so the price can be compared as a number.
+             price = int(emart.find('em', {'class': 'ssg_price'}).text.replace(",", ''))
+             unitprice = emart.find('div', {'class': 'unit'})
+             if unitprice is None:
+                 continue
+             unit = unitprice.text
+             # Reset per product so a product without a name tag cannot reuse
+             # the previous product's name.
+             name = None
+             for goods in emart.find_all('a'):
+                 name_tag = goods.find('em', {'class': 'tx_ko'})
+                 if name_tag is not None:
+                     name = name_tag.text
+             if name is None:
+                 continue
+             snack_items.append([name, price])
+             unit_price.append(unit)
+         time.sleep(10)  # pause between page requests
+         page_num += 1
+         print(page_num)
+     print(snack_items)
+     print(len(snack_items))
+     print(unit_price)
+     print(len(unit_price))
+ 
+ 
+ # Start the bottled-water crawl. There is no __main__ guard, so this also
+ # runs whenever the module is imported.
+ emart_water()
\ No newline at end of file
--- a/practice.py 0 → 100644
View file @b492d7d
+++ b/practice.py 0 → 100644
View file @b492d7d
+ # Scratch walkthrough of the product-name clean-up on one sample title.
+ a = '[노브랜드] 미네랄 워터 생수 (2L x 6개입)'
+ # Drop every leading "[...]" group, then every leading "(...)" group.
+ for opener, closer in (('[', ']'), ('(', ')')):
+     while a.startswith(opener):
+         end = a.find(closer)
+         if end == -1:
+             # Unmatched opener: leave the text alone.
+             break
+         # Slice to the end of the string; a "-1" upper bound here would drop
+         # the title's last character on every removal.
+         a = a[end + 1:]
+ 
+ # Keep characters up to the first ASCII digit, skipping spaces.
+ result = ''
+ for i in a:
+     if '0' <= i <= '9':
+         break
+     if i != ' ':
+         result += i
+ # Remove an opener left dangling by the digit cut; rstrip is safe on an
+ # empty string, where indexing result[-1] would raise IndexError.
+ result = result.rstrip('([ ')
+