Showing 3 changed files with 78 additions and 0 deletions
data/datacrawling.py
0 → 100644
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import pandas as pd
+import requests
+import os
+
+url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20141119000000000012_1/'
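+# The endpoint serves the Grid_20141119000000000012_1 dataset as XML; judging
+# from the requests below, row ranges are fetched as
+# <url><start row>/<end row>?AUCNG_DE=<yyyymmdd date>.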
+
+number = 0
+
+def makecsvfile(day):
+
+    # output file name for the given day
+    output_file = str(day)+'data.csv'
+
+    # create the dataframe that will hold the day's records
+    df = pd.DataFrame(columns=['row_num','aucng_de', 'pblmng_whsal_mrkt_nm','pblmng_whsal_mrkt_cd', 'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm','spcies_cd','grad','grad_cd','delngbundle_qy','stndrd','stndrd_cd','delng_qy','mumm_amt','avrg_amt','mxmm_amt','auc_co'])
+
+    # initialize counters
+    i = 0  # row index for the day
+    number = 0  # request-block index (local; shadows the module-level number)
+
+    while True:
+
+        # build the request URL and fetch the next block of rows
+        myurl = url + str(number*1000+1) + '/' + str((number+1)*1000) + '?AUCNG_DE=' + str(day)
+        data = urlopen(myurl).read()
+        soup = BeautifulSoup(data, 'html.parser')
+
+        # stop on an API error code
+        result_code = soup.find('result')
+        result_code = result_code.code.string
+        if result_code != 'INFO-000':
+            print(result_code)
+            break
+
+        # stop once every row has been fetched
+        start_num = int(soup.find('startrow').string)
+        total_num = int(soup.find('totalcnt').string)
+        print(start_num)
+        if total_num < start_num:
+            print('find all')
+            break
+
+        # success: append each returned row to the dataframe
+        items = soup.find_all('row')
+        for item in items:
+            df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string, item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.grad.string, item.grad_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.delng_qy.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.auc_co.string]
+            i += 1
+
+        # next block of 1000 rows
+        number += 1
+
+    # print the day and its row count for verification
+    print(str(day), ' : ', str(i))
+    # export to a csv file
+    df.to_csv(os.path.join('./', output_file), encoding='euc-kr', index=False)
+
+def checkdatanum(day):
+    # request a single row just to read totalcnt for the given day
+    myurl = url + '1/1?AUCNG_DE=' + str(day)
+
+    req = requests.get(myurl)
+    html = req.text
+    soup = BeautifulSoup(html, 'html.parser')
+    product_num = soup.find('totalcnt')
+    product_num = int(product_num.string)
+    print(day, ':', product_num)
+    return product_num
+
+
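+# Driver: sum the per-day row counts for one week of auction dates.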
+days = [20200413, 20200414, 20200415, 20200416, 20200417, 20200418, 20200412]
+
+for day in days:
+    number += checkdatanum(day)
+
+print('week : ', number)
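Note that the commit defines makecsvfile but never calls it; the script as committed only prints the weekly row count. A minimal sketch of how the CSV export could be driven from the same days list (hypothetical driver code, not part of this commit):

    for day in days:
        makecsvfile(day)  # hypothetical call: writes e.g. 20200413data.csv to the working directory

Each call pages through the API in blocks of 1000 rows and writes one euc-kr encoded CSV per day.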
~$진행과정.docx
0 → 100644
No preview for this file type
진행과정.docx
0 → 100644
No preview for this file type