# crawling.py 6.59 KB
#
# Raw Blame History Permalink

import requests
import time
from bs4 import BeautifulSoup


def name_preprocessing(a):
    """Reduce a raw product title to a compact product name.

    Processing steps:

    1. Strip leading ``[...]`` tags, then leading ``(...)`` tags. Spaces left
       behind by a stripped tag are removed so that several consecutive tags
       are all stripped.
    2. Copy characters up to (not including) the first ASCII digit, dropping
       every space on the way.
    3. Remove any trailing ``(``, ``[`` or space left over from the cut.

    A leading bracket with no matching closing bracket is left in place
    rather than consuming the text.

    Args:
        a: Raw product title.

    Returns:
        The cleaned name; may be an empty string.
    """
    for opener, closer in (('[', ']'), ('(', ')')):
        while a.startswith(opener):
            end = a.find(closer)
            if end == -1:
                # Unmatched opener: nothing sensible to strip.
                break
            # Slice to the end of the string; slicing to -1 would silently
            # drop the last character of the name.
            a = a[end + 1:].lstrip(' ')

    kept = []
    for ch in a:
        if '0' <= ch <= '9':
            # The first ASCII digit starts the quantity/size part of the title.
            break
        if ch != ' ':
            kept.append(ch)
    return ''.join(kept).rstrip('([ ')


# 이마트 생수
def emart_water():
    """Crawl the Emart (emart.ssg.com) bottled-water category and print the results.

    The total product count is read from the first listing page, then the
    listing is walked page by page. Only products whose unit-price text
    contains ``'ml'`` are kept; their names are cleaned with
    ``name_preprocessing``. The collected ``[name, price]`` pairs and the
    matching unit-price strings are printed at the end with their lengths.

    Returns:
        None. All results are written to standard output.

    Raises:
        requests.RequestException: If a request fails or times out.
        AttributeError: If the product-count element or a price element is
            missing from the page.
        ValueError: If the count or a price is not an integer.
    """
    list_url = 'http://emart.ssg.com/category/listCategoryItem.ssg?dispCtgId=6000023670&page={}'
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # The first page carries the total number of products in the category.
    with requests.get(list_url.format(1), timeout=request_timeout) as response:
        soup = BeautifulSoup(response.content, 'html.parser')
    count_tag = soup.find('span', "tx_ko")
    # The count may contain thousands separators (e.g. "1,234").
    total = int(count_tag.find('em').text.replace(',', '').strip())
    print(total)

    products = []  # [name, price] pairs
    unit_prices = []  # unit-price text, parallel to ``products``
    seen = 0  # product tiles visited so far, counted once per tile
    page_num = 1
    while True:
        print(seen)
        if seen >= total:
            break  # every product reported by the site has been visited

        with requests.get(list_url.format(page_num), timeout=request_timeout) as response:
            soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('div', {'id': 'ty_thmb_view'})
        items = table.find_all('li', {'class': 'cunit_t232'}) if table is not None else []
        if not items:
            # No product tiles on this page: stop instead of requesting
            # further pages forever.
            break

        for item in items:
            seen += 1
            # Remove the thousands separator so the price can be compared as a number.
            price = int(item.find('em', {'class': 'ssg_price'}).text.replace(',', ''))

            unit_tag = item.find('div', {'class': 'unit'})
            if unit_tag is None:
                continue
            unit = unit_tag.text
            if 'ml' not in unit:
                continue

            # The product name sits in an <em class="tx_ko"> inside one of the
            # tile's anchors; keep the last one found.
            name_tag = None
            for anchor in item.find_all('a'):
                candidate = anchor.find('em', {'class': 'tx_ko'})
                if candidate is not None:
                    name_tag = candidate
            if name_tag is None:
                # No name on this tile: skip it rather than reuse the
                # previous product's name.
                continue

            name = name_preprocessing(name_tag.text)
            print(name)
            products.append([name, price])
            unit_prices.append(unit)

        time.sleep(10)  # wait 10 seconds before requesting the next page
        page_num += 1
        print(page_num)

    print(products)
    print(len(products))
    print(unit_prices)
    print(len(unit_prices))


# 이마트 과자
def emart_snack():
    """Crawl the Emart (emart.ssg.com) snack category and print the results.

    The total product count is read from the first category page, then the
    category is walked page by page. Products without a unit-price element
    are skipped. The collected ``[name, price]`` pairs (names are stored as
    found on the page) and the matching unit-price strings are printed at the
    end with their lengths.

    Returns:
        None. All results are written to standard output.

    Raises:
        requests.RequestException: If a request fails or times out.
        AttributeError: If the product-count element or a price element is
            missing from the page.
        ValueError: If the count or a price is not an integer.
    """
    list_url = 'http://emart.ssg.com/category/main.ssg?dispCtgId=6000023666&page={}'
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # The first page carries the total number of products in the category.
    with requests.get(list_url.format(1), timeout=request_timeout) as response:
        soup = BeautifulSoup(response.content, 'html.parser')
    count_tag = soup.find('span', "tx_ko")
    # The count may contain thousands separators (e.g. "1,234").
    total = int(count_tag.find('em').text.replace(',', ''))
    print(total)

    products = []  # [name, price] pairs
    unit_prices = []  # unit-price text, parallel to ``products``
    seen = 0  # product tiles visited so far, counted once per tile
    page_num = 1
    while seen < total:
        with requests.get(list_url.format(page_num), timeout=request_timeout) as response:
            soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('div', {'id': 'ty_thmb_view'})
        items = table.find_all('li', {'class': 'cunit_t232'}) if table is not None else []
        if not items:
            # No product tiles on this page: stop instead of requesting
            # further pages forever.
            break

        for item in items:
            # Count each tile exactly once; counting it again on the skip
            # path and per name tag would end the crawl early.
            seen += 1
            # Remove the thousands separator so the price can be compared as a number.
            price = int(item.find('em', {'class': 'ssg_price'}).text.replace(',', ''))

            unit_tag = item.find('div', {'class': 'unit'})
            if unit_tag is None:
                continue
            unit = unit_tag.text

            # The product name sits in an <em class="tx_ko"> inside one of the
            # tile's anchors; keep the last one found.
            name_tag = None
            for anchor in item.find_all('a'):
                candidate = anchor.find('em', {'class': 'tx_ko'})
                if candidate is not None:
                    name_tag = candidate
            if name_tag is None:
                # No name on this tile: skip it rather than reuse the
                # previous product's name.
                continue

            products.append([name_tag.text, price])
            unit_prices.append(unit)

        time.sleep(10)  # wait 10 seconds before requesting the next page
        page_num += 1
        print(page_num)

    print(products)
    print(len(products))
    print(unit_prices)
    print(len(unit_prices))


# Script entry: start the bottled-water crawl immediately. There is no
# ``__main__`` guard, so this call also runs whenever the module is imported.
emart_water()