distributiondata.py
3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import requests
import os, sys
import datetime
# Base endpoint of the open-API grid that every request in this script extends.
# The pieces are: host + "/openapi/", a long hex token, and the XML grid id.
# NOTE(review): the hex token looks like a personal API key committed to the
# source -- confirm, and consider loading it from configuration instead.
url = (
    'http://211.237.50.150:7080/openapi/'
    '5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf'
    '/xml/Grid_20160722000000000352_1/'
)
# Scraper function: to reuse it for a different site/dataset, only the field (column) names need to be changed.
def _tag_text(node, name):
    """Return the text of the first ``name`` descendant of ``node``, or None.

    None is returned both when the tag is absent and when it has no single
    string child, so callers never dereference a missing tag.
    """
    found = node.find(name)
    if found is None or found.string is None:
        return None
    # Plain ``str`` so the value does not keep the whole parse tree alive.
    return str(found.string)


def makecsvfile(day):
    """Download every record the API returns for ``day`` and save it as CSV.

    The module-level ``url`` is queried in pages of 1000 rows
    (``<url><first>/<last>?AUCNG_DE=<day>``) until the service reports a
    result code other than ``INFO-000`` or a start row beyond the total
    count.  Whatever was collected is then written to
    ``./jointmarketdata/<day>data.csv`` (EUC-KR encoded, no index column).

    Args:
        day: Value sent as the ``AUCNG_DE`` query parameter and used as the
            file-name prefix; it is passed through ``str()``.

    Raises:
        ValueError: If a successful response lacks ``startrow``/``totalcnt``
            or they are not integers.
        urllib.error.URLError: If the HTTP request fails or times out.
    """
    # file name setting
    output_file = str(day) + 'data.csv'
    output_dir = './jointmarketdata'
    page_size = 1000
    timeout_seconds = 60

    ########################## data columns: MUST be adapted per dataset ################################
    columns = [
        'row_num', 'examin_de', 'examin_se_nm', 'examin_se_code',
        'examin_area_name', 'examin_area_code', 'examin_mrkt_nm',
        'examin_mrkt_code', 'std_mrkt_nm', 'std_mrkt_code',
        'EXAMIN_PRDLST_NM', 'EXAMIN_PRDLST_CODE', 'EXAMIN_SPCIES_NM',
        'EXAMIN_SPCIES_CODE', 'STD_LCLAS_NM', 'STD_LCLAS_CO',
        'STD_PRDLST_NM', 'STD_PRDLST_CODE', 'STD_SPCIES_NM',
        'STD_SPCIES_CODE', 'EXAMIN_UNIT_NM', 'EXAMIN_UNIT', 'STD_UNIT_NM',
        'STD_UNIT_CODE', 'EXAMIN_GRAD_NM', 'EXAMIN_GRAD_CODE', 'STD_GRAD_NM',
        'STD_GRAD_CODE', 'TODAY_PRIC', 'BFRT_PRIC', 'IMP_TRADE', 'TRADE_AMT',
    ]
    # 'html.parser' lower-cases every tag name, so a lookup such as
    # ``item.EXAMIN_DE`` can never match; search by the lower-cased name.
    tag_names = [column.lower() for column in columns]

    rows = []    # one list of field values per <row> element, across all pages
    number = 0   # zero-based page index
    while True:
        # build the page URL & fetch the data
        myurl = (url + str(number * page_size + 1) + '/'
                 + str((number + 1) * page_size) + '?AUCNG_DE=' + str(day))
        # Context manager closes the connection; the timeout prevents a
        # stalled server from hanging the whole run.
        with urlopen(myurl, timeout=timeout_seconds) as response:
            data = response.read()
        soup = BeautifulSoup(data, 'html.parser')

        # data error check
        result = soup.find('result')
        result_code = _tag_text(result, 'code') if result is not None else None
        if result_code != 'INFO-000':
            print(result_code)
            break

        # data number check
        start_text = _tag_text(soup, 'startrow')
        total_text = _tag_text(soup, 'totalcnt')
        if start_text is None or total_text is None:
            raise ValueError(
                'response for %s has no startrow/totalcnt element' % day)
        start_num = int(start_text)
        total_num = int(total_text)
        print(start_text)
        if total_num < start_num:
            print('find all')
            break

        # the page is fine: collect its rows (a missing field becomes None)
        for item in soup.find_all('row'):
            rows.append([_tag_text(item, tag) for tag in tag_names])

        # next 1000 rows
        number += 1

    # progress print so the result can be checked
    print(str(day), ' : ', str(len(rows)))

    # Build the frame once; appending with df.loc[i] re-allocates per row.
    df = pd.DataFrame(rows, columns=columns)

    # export as a csv file
    ############################################# change saved file directory ####################################
    os.makedirs(output_dir, exist_ok=True)  # to_csv does not create directories
    df.to_csv(os.path.join(output_dir, output_file), encoding='euc-kr', index=False)
def checkdatanum(day):
    """Return the total record count the API reports for ``day``.

    Requests a single row (``<url>1/1?AUCNG_DE=<day>``) from the module-level
    ``url`` and reads the ``totalcnt`` element of the response.  The count is
    also printed as ``<day> : <count>``.

    Args:
        day: Value sent as the ``AUCNG_DE`` query parameter (passed through
            ``str()``).

    Returns:
        int: The value of the ``totalcnt`` element.

    Raises:
        ValueError: If the response has no ``totalcnt`` element or its text
            is not an integer.
        requests.RequestException: If the request fails or times out.
    """
    myurl = url + '1/1?AUCNG_DE=' + str(day)
    # Without a timeout, requests waits indefinitely on a stalled server.
    req = requests.get(myurl, timeout=60)
    soup = BeautifulSoup(req.text, 'html.parser')

    total_tag = soup.find('totalcnt')
    if total_tag is None:
        # Name the real problem instead of failing later on int('').
        raise ValueError(
            'no totalcnt element in response for %s (HTTP %s)'
            % (day, req.status_code))

    # Read the tag's text directly rather than slicing its string form,
    # which only works for an attribute-free tag of this exact name.
    product_num = int(total_tag.get_text())
    print(day, ':', product_num)
    return product_num
# Module-level counter kept from the original script; nothing below reads it.
i = 0

##################################### change the start date & end date ############################################
first_day = datetime.date(2020, 3, 1)
last_day = datetime.date(2020, 3, 31)

# Export one CSV per calendar day, from first_day through last_day inclusive.
# An end date earlier than the start date yields an empty range (no calls).
for day_offset in range((last_day - first_day).days + 1):
    current_day = first_day + datetime.timedelta(days=day_offset)
    makecsvfile(current_day.strftime('%Y%m%d'))

# Remove every name from this module's namespace once the run has finished
# (presumably so an interactive session starts clean on the next run -- confirm).
sys.modules[__name__].__dict__.clear()