Merge branch 'feature/220515_joongna_selenium_to_rod' into 'main'
Feature/220515 joongna selenium to rod

# 중고나라 API 크롤링 패키지 변경 및 병렬 처리

1. 기존에 사용하던 go-selenium이 headless를 지원하지 않음
2. go-rod로 변경 및 goroutine으로 병렬 처리

See merge request !10
Showing
5 changed files
with
63 additions
and
65 deletions
1 | FROM golang:1.17.3 | 1 | FROM golang:1.17.3 |
2 | 2 | ||
3 | -ENV Xvfb :99 | ||
4 | -ENV DISPLAY=:99 | ||
5 | -ENV GO111MODULE=on | ||
6 | - | ||
7 | -RUN apt-get -y update | ||
8 | -RUN apt-get install -y wget xvfb gnupg | ||
9 | - | ||
10 | -RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - | ||
11 | -RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' | ||
12 | -RUN apt-get -y update | ||
13 | -RUN apt-get install -y google-chrome-stable | ||
14 | - | ||
15 | -RUN apt-get install -yqq unzip | ||
16 | -RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip | ||
17 | -RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ | ||
18 | - | ||
19 | WORKDIR /src | 3 | WORKDIR /src |
20 | COPY . /src | 4 | COPY . /src |
21 | 5 | ||
6 | +RUN apt-get update | ||
7 | +RUN apt-get install -y libgconf-2-4 libatk1.0-0 libatk-bridge2.0-0 libgdk-pixbuf2.0-0 libgtk-3-0 libgbm-dev libnss3-dev libxss-dev libasound2 | ||
22 | RUN go build -o Joongna_api_server | 8 | RUN go build -o Joongna_api_server |
23 | 9 | ||
24 | EXPOSE 8080 | 10 | EXPOSE 8080 | ... | ... |
... | @@ -9,6 +9,7 @@ require ( | ... | @@ -9,6 +9,7 @@ require ( |
9 | github.com/bunsenapp/go-selenium v0.1.0 // indirect | 9 | github.com/bunsenapp/go-selenium v0.1.0 // indirect |
10 | github.com/caarlos0/env/v6 v6.9.1 // indirect | 10 | github.com/caarlos0/env/v6 v6.9.1 // indirect |
11 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef // indirect | 11 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef // indirect |
12 | + github.com/go-rod/rod v0.106.6 // indirect | ||
12 | github.com/joho/godotenv v1.4.0 // indirect | 13 | github.com/joho/godotenv v1.4.0 // indirect |
13 | github.com/labstack/echo/v4 v4.7.2 // indirect | 14 | github.com/labstack/echo/v4 v4.7.2 // indirect |
14 | github.com/labstack/gommon v0.3.1 // indirect | 15 | github.com/labstack/gommon v0.3.1 // indirect |
... | @@ -17,6 +18,9 @@ require ( | ... | @@ -17,6 +18,9 @@ require ( |
17 | github.com/tebeka/selenium v0.9.9 // indirect | 18 | github.com/tebeka/selenium v0.9.9 // indirect |
18 | github.com/valyala/bytebufferpool v1.0.0 // indirect | 19 | github.com/valyala/bytebufferpool v1.0.0 // indirect |
19 | github.com/valyala/fasttemplate v1.2.1 // indirect | 20 | github.com/valyala/fasttemplate v1.2.1 // indirect |
21 | + github.com/ysmood/goob v0.4.0 // indirect | ||
22 | + github.com/ysmood/gson v0.7.1 // indirect | ||
23 | + github.com/ysmood/leakless v0.7.0 // indirect | ||
20 | golang.org/x/crypto v0.0.0-20210817164053-32db794688a5 // indirect | 24 | golang.org/x/crypto v0.0.0-20210817164053-32db794688a5 // indirect |
21 | golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f // indirect | 25 | golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f // indirect |
22 | golang.org/x/sys v0.0.0-20211103235746-7861aae1554b // indirect | 26 | golang.org/x/sys v0.0.0-20211103235746-7861aae1554b // indirect | ... | ... |
... | @@ -21,6 +21,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs | ... | @@ -21,6 +21,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs |
21 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | 21 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= |
22 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef h1:0z8rB8nAGEso7PhKS21wBjjxTp2uGPyZ6STzRc7mnBY= | 22 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef h1:0z8rB8nAGEso7PhKS21wBjjxTp2uGPyZ6STzRc7mnBY= |
23 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef/go.mod h1:RUn/EmpfFIGHvmeXmh+hk1UaCbjOXa6vl7/kx1b6wxw= | 23 | github.com/fedesog/webdriver v0.0.0-20180606182539-99f36c92eaef/go.mod h1:RUn/EmpfFIGHvmeXmh+hk1UaCbjOXa6vl7/kx1b6wxw= |
24 | +github.com/go-rod/rod v0.106.6 h1:zJorVPG7s8Xgbh7PkSySP4FNoo0OiougKaMb3j6zT6w= | ||
25 | +github.com/go-rod/rod v0.106.6/go.mod h1:xkZOchuKqTOkMOBkrzb7uJpbKZRab1haPCWDvuZkS2U= | ||
24 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= | 26 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= |
25 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= | 27 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= |
26 | github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= | 28 | github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= |
... | @@ -60,6 +62,14 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw | ... | @@ -60,6 +62,14 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw |
60 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= | 62 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= |
61 | github.com/valyala/fasttemplate v1.2.1 h1:TVEnxayobAdVkhQfrfes2IzOB6o+z4roRkPF52WA1u4= | 63 | github.com/valyala/fasttemplate v1.2.1 h1:TVEnxayobAdVkhQfrfes2IzOB6o+z4roRkPF52WA1u4= |
62 | github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= | 64 | github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= |
65 | +github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ= | ||
66 | +github.com/ysmood/goob v0.4.0/go.mod h1:u6yx7ZhS4Exf2MwciFr6nIM8knHQIE22lFpWHnfql18= | ||
67 | +github.com/ysmood/got v0.29.1/go.mod h1:pE1l4LOwOBhQg6A/8IAatkGp7uZjnalzrZolnlhhMgY= | ||
68 | +github.com/ysmood/gotrace v0.6.0/go.mod h1:TzhIG7nHDry5//eYZDYcTzuJLYQIkykJzCRIo4/dzQM= | ||
69 | +github.com/ysmood/gson v0.7.1 h1:zKL2MTGtynxdBdlZjyGsvEOZ7dkxaY5TH6QhAbTgz0Q= | ||
70 | +github.com/ysmood/gson v0.7.1/go.mod h1:3Kzs5zDl21g5F/BlLTNcuAGAYLKt2lV5G8D1zF3RNmg= | ||
71 | +github.com/ysmood/leakless v0.7.0 h1:XCGdaPExyoreoQd+H5qgxM3ReNbSPFsEXpSKwbXbwQw= | ||
72 | +github.com/ysmood/leakless v0.7.0/go.mod h1:R8iAXPRaG97QJwqxs74RdwzcRHT1SWCGTNqY8q0JvMQ= | ||
63 | go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= | 73 | go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= |
64 | go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= | 74 | go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= |
65 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= | 75 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= | ... | ... |
... | @@ -12,47 +12,49 @@ import ( | ... | @@ -12,47 +12,49 @@ import ( |
12 | "net/url" | 12 | "net/url" |
13 | "strconv" | 13 | "strconv" |
14 | "strings" | 14 | "strings" |
15 | + "sync" | ||
15 | "time" | 16 | "time" |
16 | 17 | ||
17 | "github.com/PuerkitoBio/goquery" | 18 | "github.com/PuerkitoBio/goquery" |
18 | - "github.com/fedesog/webdriver" | 19 | + "github.com/go-rod/rod" |
19 | ) | 20 | ) |
20 | 21 | ||
21 | func GetItemByKeyword(keyword string) ([]model.Item, error) { | 22 | func GetItemByKeyword(keyword string) ([]model.Item, error) { |
22 | var items []model.Item | 23 | var items []model.Item |
24 | + wg := sync.WaitGroup{} | ||
25 | + | ||
26 | + itemsInfo, err := getItemsInfoByKeyword(keyword) | ||
27 | + if err != nil { | ||
28 | + return nil, err | ||
29 | + } | ||
23 | 30 | ||
24 | - itemsInfo := getItemsInfoByKeyword(keyword) | ||
25 | for _, itemInfo := range itemsInfo { | 31 | for _, itemInfo := range itemsInfo { |
26 | - if itemInfo.CafeName != "중고나라" { | ||
27 | - continue | ||
28 | - } | ||
29 | itemUrl := itemInfo.Link | 32 | itemUrl := itemInfo.Link |
30 | - sold, price, thumbnailUrl, extraInfo := crawlingNaverCafe(itemUrl) | 33 | + if itemInfo.CafeName != "중고나라" { |
31 | - | ||
32 | - if sold == "판매 완료" { | ||
33 | continue | 34 | continue |
34 | } | 35 | } |
35 | - | 36 | + wg.Add(1) |
36 | - item := model.Item{ | 37 | + go func(itemUrl string) { |
37 | - Platform: "중고나라", | 38 | + defer wg.Done() |
38 | - Name: itemInfo.Title, | 39 | + item, err := crawlingNaverCafe(itemUrl) |
39 | - Price: price, | 40 | + if err != nil { |
40 | - ThumbnailUrl: thumbnailUrl, | 41 | + log.Fatal(err) |
41 | - ItemUrl: itemUrl, | 42 | + } |
42 | - ExtraInfo: extraInfo, | 43 | + items = append(items, *item) |
43 | - } | 44 | + }(itemUrl) |
44 | - items = append(items, item) | ||
45 | } | 45 | } |
46 | + wg.Wait() | ||
47 | + | ||
46 | return items, nil | 48 | return items, nil |
47 | } | 49 | } |
48 | 50 | ||
49 | -func getItemsInfoByKeyword(keyword string) []model.ApiResponseItem { | 51 | +func getItemsInfoByKeyword(keyword string) ([]model.ApiResponseItem, error) { |
50 | encText := url.QueryEscape("중고나라 " + keyword + " 판매중") | 52 | encText := url.QueryEscape("중고나라 " + keyword + " 판매중") |
51 | apiUrl := "https://openapi.naver.com/v1/search/cafearticle.json?query=" + encText + "&sort=sim" | 53 | apiUrl := "https://openapi.naver.com/v1/search/cafearticle.json?query=" + encText + "&sort=sim" |
52 | 54 | ||
53 | req, err := http.NewRequest("GET", apiUrl, nil) | 55 | req, err := http.NewRequest("GET", apiUrl, nil) |
54 | if err != nil { | 56 | if err != nil { |
55 | - log.Fatal(err) | 57 | + return nil, err |
56 | } | 58 | } |
57 | req.Header.Add("X-Naver-Client-Id", config.Cfg.Secret.CLIENTID) | 59 | req.Header.Add("X-Naver-Client-Id", config.Cfg.Secret.CLIENTID) |
58 | req.Header.Add("X-Naver-Client-Secret", config.Cfg.Secret.CLIENTSECRET) | 60 | req.Header.Add("X-Naver-Client-Secret", config.Cfg.Secret.CLIENTSECRET) |
... | @@ -60,7 +62,7 @@ func getItemsInfoByKeyword(keyword string) []model.ApiResponseItem { | ... | @@ -60,7 +62,7 @@ func getItemsInfoByKeyword(keyword string) []model.ApiResponseItem { |
60 | client := &http.Client{} | 62 | client := &http.Client{} |
61 | resp, err := client.Do(req) | 63 | resp, err := client.Do(req) |
62 | if err != nil { | 64 | if err != nil { |
63 | - log.Fatal(err) | 65 | + return nil, err |
64 | } | 66 | } |
65 | defer func(Body io.ReadCloser) { | 67 | defer func(Body io.ReadCloser) { |
66 | err := Body.Close() | 68 | err := Body.Close() |
... | @@ -75,52 +77,48 @@ func getItemsInfoByKeyword(keyword string) []model.ApiResponseItem { | ... | @@ -75,52 +77,48 @@ func getItemsInfoByKeyword(keyword string) []model.ApiResponseItem { |
75 | if err != nil { | 77 | if err != nil { |
76 | log.Fatal(err) | 78 | log.Fatal(err) |
77 | } | 79 | } |
78 | - return apiResponse.Items | 80 | + return apiResponse.Items, nil |
79 | } | 81 | } |
80 | 82 | ||
81 | -func crawlingNaverCafe(cafeUrl string) (string, int, string, string) { | 83 | +func crawlingNaverCafe(cafeUrl string) (*model.Item, error) { |
82 | - driver := webdriver.NewChromeDriver("./chromedriver") | 84 | + frame := rod.New().MustConnect().MustPage(cafeUrl).MustElement("iframe#cafe_main") |
83 | - err := driver.Start() | 85 | + time.Sleep(time.Second * 2) |
84 | - if err != nil { | 86 | + source := frame.MustFrame().MustHTML() |
85 | - log.Println(err) | 87 | + html, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(source))) |
86 | - } | ||
87 | - desired := webdriver.Capabilities{"Platform": "Linux"} | ||
88 | - required := webdriver.Capabilities{} | ||
89 | - session, err := driver.NewSession(desired, required) | ||
90 | - if err != nil { | ||
91 | - log.Println(err) | ||
92 | - } | ||
93 | - err = session.Url(cafeUrl) | ||
94 | - if err != nil { | ||
95 | - log.Println(err) | ||
96 | - } | ||
97 | - time.Sleep(time.Second * 1) | ||
98 | - err = session.FocusOnFrame("cafe_main") | ||
99 | - if err != nil { | ||
100 | - log.Fatal(err) | ||
101 | - } | ||
102 | - resp, err := session.Source() | ||
103 | - | ||
104 | - html, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(resp))) | ||
105 | if err != nil { | 88 | if err != nil { |
106 | - log.Fatal(err) | 89 | + return nil, err |
107 | } | 90 | } |
108 | 91 | ||
92 | + title := html.Find("h3.title_text").Text() | ||
109 | sold := html.Find("div.sold_area").Text() | 93 | sold := html.Find("div.sold_area").Text() |
110 | price := priceStringToInt(html.Find(".ProductPrice").Text()) | 94 | price := priceStringToInt(html.Find(".ProductPrice").Text()) |
111 | thumbnailUrl, _ := html.Find("div.product_thumb img").Attr("src") | 95 | thumbnailUrl, _ := html.Find("div.product_thumb img").Attr("src") |
112 | extraInfo := html.Find(".se-module-text").Text() | 96 | extraInfo := html.Find(".se-module-text").Text() |
113 | 97 | ||
98 | + title = strings.TrimSpace(title) | ||
114 | sold = strings.TrimSpace(sold) | 99 | sold = strings.TrimSpace(sold) |
115 | thumbnailUrl = strings.TrimSpace(thumbnailUrl) | 100 | thumbnailUrl = strings.TrimSpace(thumbnailUrl) |
116 | extraInfo = strings.TrimSpace(extraInfo) | 101 | extraInfo = strings.TrimSpace(extraInfo) |
117 | 102 | ||
118 | - return sold, price, thumbnailUrl, extraInfo | 103 | + item := model.Item{ |
104 | + Platform: "중고나라", | ||
105 | + Name: title, | ||
106 | + Price: price, | ||
107 | + ThumbnailUrl: thumbnailUrl, | ||
108 | + ItemUrl: cafeUrl, | ||
109 | + ExtraInfo: extraInfo, | ||
110 | + } | ||
111 | + | ||
112 | + return &item, nil | ||
119 | } | 113 | } |
120 | 114 | ||
121 | func priceStringToInt(priceString string) int { | 115 | func priceStringToInt(priceString string) int { |
122 | strings.TrimSpace(priceString) | 116 | strings.TrimSpace(priceString) |
123 | 117 | ||
118 | + if priceString == "" { | ||
119 | + return 0 | ||
120 | + } | ||
121 | + | ||
124 | priceString = strings.ReplaceAll(priceString, "원", "") | 122 | priceString = strings.ReplaceAll(priceString, "원", "") |
125 | priceString = strings.ReplaceAll(priceString, ",", "") | 123 | priceString = strings.ReplaceAll(priceString, ",", "") |
126 | 124 | ... | ... |
-
Please register or login to post a comment