김태영

Merge branch 'feature/crawling' into 'master'

Feature/crawling

teachable machine 모델 만드는 데 쓰이는 이미지 크롤링

See merge request !6
1 -/node_modules/
...\ No newline at end of file ...\ No newline at end of file
1 +/node_modules/
2 +/python/model
3 +/python/chromedriver
...\ No newline at end of file ...\ No newline at end of file
......
1 +from selenium import webdriver
2 +from selenium.webdriver.common.keys import Keys
3 +import time
4 +import urllib.request
5 +import os
6 +from multiprocessing import Pool #멀티쓰레딩
7 +
# Install the chromedriver build that matches your Chrome version and point
# this setting at the binary. When the CHROMEDRIVER_PATH environment variable
# is set it takes precedence, so the script can run on another machine without
# editing the source; otherwise the original hard-coded location is used.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/kimtaeyoung/Desktop/project/opensource_termproject/2020-02-OSS-TermProject/python/chromedriver",
)
# Maximum number of images to download per keyword.
crawling_num = 500

# Headless Selenium: options applied to every Chrome driver that is started.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
# Alternatively: options.add_argument("--disable-gpu")
21 +
def createFolder(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Errors are reported on stdout instead of being raised, so a failure to
    create the folder never aborts the caller.

    Args:
        directory: Path of the folder to create.
    """
    try:
        # exist_ok=True replaces the separate "exists?" check, which could
        # race when another process creates the same folder in between.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' + directory)
29 +
def get_keywords(keywords_file='python/keywords.txt'):
    """Load the search keywords, one per line, from ``keywords_file``.

    Surrounding whitespace is stripped from every line, blank lines are
    dropped, and duplicates are removed. The cleaned, sorted list is written
    back to the same file (one keyword per line) before it is returned.

    Args:
        keywords_file: Path of the UTF-8 text file holding the keywords.

    Returns:
        The sorted list of unique keywords.
    """
    # 'utf-8-sig' transparently skips a BOM if the file was saved with one.
    with open(keywords_file, 'r', encoding='utf-8-sig') as f:
        # Strip first so "person " and "person" collapse into one keyword
        # and whitespace-only lines are not treated as search terms.
        cleaned = (line.strip() for line in f)
        keywords = sorted({line for line in cleaned if line})

    print('{} keywords found: {}'.format(len(keywords), keywords))

    # Re-save the normalized keyword list.
    with open(keywords_file, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(keyword) for keyword in keywords)

    return keywords
48 +
def _scroll_to_end(driver, pause=1):
    """Scroll the results page down until its height stops growing.

    When the height no longer changes, the ``.mye4qd`` element (presumably
    the "show more results" button -- confirm against the live page) is
    clicked; if it cannot be found or clicked, scrolling is finished.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load the next batch of thumbnails.
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            try:
                driver.find_element_by_css_selector(".mye4qd").click()
            except Exception:
                break
        last_height = new_height


def crawling(search_name):
    """Download Google image-search results for one keyword.

    Images are saved as ``python/model/<search_name>/<n>.jpg``, numbered from
    1, until ``crawling_num`` files have been written or the thumbnails run
    out. Pass each entry of ``get_keywords()`` as ``search_name``.

    Args:
        search_name: The search term; also used as the output folder name.
    """
    # Start Chrome with the module-level (headless) options.
    driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
    try:
        driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
        elem = driver.find_element_by_name("q")
        elem.send_keys(search_name)
        elem.send_keys(Keys.RETURN)

        _scroll_to_end(driver)

        images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")

        # Create the per-keyword output folder if it is missing.
        target_dir = "python/model/" + search_name
        createFolder(target_dir)

        # The opener does not change per image, so install it once up front.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        count = 1
        for image in images:
            try:
                image.click()
                # Time for the clicked image to load; 2-3 seconds is stable.
                time.sleep(2)
                imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
                urllib.request.urlretrieve(imgUrl, os.path.join(target_dir, str(count) + ".jpg"))
            except Exception as err:
                # Best effort: one image that cannot be opened or downloaded
                # must not stop the crawl, but Ctrl-C still propagates.
                print('Skipped an image for "{}": {}'.format(search_name, type(err).__name__))
                continue

            count = count + 1
            # Stop once the requested number of images has been saved.
            if count > crawling_num:
                break
    finally:
        # quit() ends the whole driver session, even if the crawl failed.
        driver.quit()
101 +
if __name__ == '__main__':

    # Make sure the root output folder exists before any worker starts.
    createFolder("python/model/")

    # Crawl with 3 worker processes, one keyword per task. map() blocks until
    # every keyword is done; the with-block then shuts the pool down.
    with Pool(processes=3) as pool:
        pool.map(crawling, get_keywords())
...\ No newline at end of file ...\ No newline at end of file
1 +inside background
2 +person
3 +with mask
1 +selenium
...\ No newline at end of file ...\ No newline at end of file