김건

Youtube Comment Threads Downloading

1 +# Byte-compiled / optimized / DLL files
2 +__pycache__/
3 +*.py[cod]
4 +
5 +# C extensions
6 +*.so
7 +
8 +# Distribution / packaging
9 +.Python
10 +env/
11 +build/
12 +develop-eggs/
13 +dist/
14 +downloads/
15 +eggs/
16 +.eggs/
17 +lib/
18 +lib64/
19 +parts/
20 +sdist/
21 +var/
22 +*.egg-info/
23 +.installed.cfg
24 +*.egg
25 +
26 +# PyInstaller
27 +# Usually these files are written by a python script from a template
28 +# before PyInstaller builds the exe, so as to inject date/other infos into it.
29 +*.manifest
30 +*.spec
31 +
32 +# Installer logs
33 +pip-log.txt
34 +pip-delete-this-directory.txt
35 +
36 +# Unit test / coverage reports
37 +htmlcov/
38 +.tox/
39 +.coverage
40 +.coverage.*
41 +.cache
42 +nosetests.xml
43 +coverage.xml
44 +*,cover
45 +
46 +# Translations
47 +*.mo
48 +*.pot
49 +
50 +# Django stuff:
51 +*.log
52 +
53 +# Sphinx documentation
54 +docs/_build/
55 +
56 +# PyBuilder
57 +target/
1 +The MIT License (MIT)
2 +
3 +Copyright (c) 2015 Egbert Bouman
4 +
5 +Permission is hereby granted, free of charge, to any person obtaining a copy
6 +of this software and associated documentation files (the "Software"), to deal
7 +in the Software without restriction, including without limitation the rights
8 +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is
10 +furnished to do so, subject to the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all
13 +copies or substantial portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 +SOFTWARE.
22 +
1 +# youtube-comment-downloader
2 +Simple script for downloading YouTube comments without using the YouTube API. The output is line-delimited JSON: one comment object per line.
3 +
4 +### Dependencies
5 +* Python 2.7+
6 +* requests
7 +* lxml
8 +* cssselect
9 +
10 +The python packages can be installed with
11 +
12 + pip install requests
13 + pip install lxml
14 + pip install cssselect
15 +
16 +### Usage
17 +```
18 +usage: downloader.py [--help] [--youtubeid YOUTUBEID] [--output OUTPUT] [--limit LIMIT]
19 +
20 +Download Youtube comments without using the Youtube API
21 +
22 +optional arguments:
23 + --help, -h Show this help message and exit
24 + --youtubeid YOUTUBEID, -y YOUTUBEID
25 + ID of Youtube video for which to download the comments
26 + --output OUTPUT, -o OUTPUT
27 + Output filename (output format is line delimited JSON)
28 + --limit LIMIT, -l LIMIT
29 + Limit the number of comments
30 +```
1 +#!/usr/bin/env python
2 +
3 +from __future__ import print_function
4 +
5 +import os
6 +import sys
7 +import time
8 +import json
9 +import requests
10 +import argparse
11 +import lxml.html
12 +import io
13 +
14 +from lxml.cssselect import CSSSelector
15 +
# Page fetched first to obtain the initial comments of a video; the
# {youtube_id} placeholder is filled in with str.format().
16 +YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# Endpoint that the follow-up POST requests (further comment pages and
# replies) are sent to.
17 +YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
18 +
# Value installed as the User-Agent header of the HTTP session.
19 +USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
20 +
21 +
def find_value(html, key, num_chars=2):
    """Return the double-quote-terminated value that follows ``key``.

    The value starts ``num_chars`` characters after the end of the first
    occurrence of ``key`` (skipping separators such as ``="``) and runs up
    to, but not including, the next double quote.

    Args:
        html: Text to search.
        key: Marker that precedes the value.
        num_chars: Number of characters to skip between the end of ``key``
            and the first character of the value.

    Returns:
        The extracted substring, or an empty string when ``key`` or the
        closing double quote cannot be found.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # Without this guard the -1 from str.find() would be folded into the
        # offset arithmetic and an unrelated slice of ``html`` returned.
        return ''

    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end == -1:
        # No terminating quote: there is no well-delimited value to return.
        return ''

    return html[pos_begin:pos_end]
26 +
27 +
def extract_comments(html):
    """Yield one dict per ``.comment-item`` element found in ``html``.

    Each yielded dict has the keys ``cid`` (the element's ``data-cid``
    attribute), ``text``, ``time`` (whitespace-stripped) and ``author``,
    the last three taken from the text content of the first matching
    descendant of the comment element.

    Args:
        html: HTML markup to parse.

    Raises:
        IndexError: If a comment element has no text, time or author node.
    """
    document = lxml.html.fromstring(html)

    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    for node in select_items(document):
        comment = {'cid': node.get('data-cid')}
        comment['text'] = select_text(node)[0].text_content()
        comment['time'] = select_time(node)[0].text_content().strip()
        comment['author'] = select_author(node)[0].text_content()
        yield comment
40 +
41 +
def extract_reply_cids(html):
    """Return the ``data-cid`` of every "load replies" element in ``html``.

    The elements are those matching the CSS selector
    ``.comment-replies-header > .load-comments``.

    Args:
        html: HTML markup to parse.

    Returns:
        A list with one ``data-cid`` attribute value per matching element,
        in selector order (``None`` for elements without the attribute).
    """
    document = lxml.html.fromstring(html)
    select_load_links = CSSSelector('.comment-replies-header > .load-comments')

    cids = []
    for link in select_load_links(document):
        cids.append(link.get('data-cid'))
    return cids
46 +
47 +
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to ``url`` until an HTTP 200 response is received.

    Args:
        session: Object whose ``post(url, params=..., data=...)`` method
            returns a response with ``status_code`` and ``text`` attributes
            (the callers in this file pass a ``requests.Session``).
        url: Address to post to.
        params: Passed to ``session.post`` as ``params``.
        data: Passed to ``session.post`` as ``data``.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two attempts.

    Returns:
        A ``(page_token, html_content)`` tuple decoded from the JSON body of
        the first 200 response (``page_token`` is ``None`` when the body has
        no such key), or ``None`` when no attempt returned status 200.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        KeyError: If a 200 response body has no ``html_content`` key.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token'), response_dict['html_content']

        # Back off before retrying, but do not waste a full delay after the
        # final attempt when we are about to give up anyway.
        if attempt < retries - 1:
            time.sleep(sleep)

    return None
56 +
57 +
def download_comments(youtube_id, sleep=1):
    """Yield the comments of a Youtube video as dicts.

    Comments come from three sources, in this order: the initial comments
    page, the follow-up pages loaded through the AJAX endpoint, and the
    replies of every comment for which a "load replies" element was seen.
    Comments from the last two sources are yielded only if their ``cid`` has
    not been seen before.

    Args:
        youtube_id: ID of the video whose comments are downloaded.
        sleep: Seconds to pause after each successful AJAX request.

    Yields:
        Comment dicts as produced by ``extract_comments``.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # A set keeps the duplicate check O(1) per comment instead of scanning a
    # list that grows with every comment downloaded.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request sends 'order_menu' instead of a page token; every
        # later request sends the token returned by the previous one.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
129 +
130 +
def main(argv):
    """Command-line entry point: download comments and write them to a file.

    Each comment is written to the output file as one JSON document per
    line, and a running count is shown on stdout.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 1 when the required options are missing or
            any error occurs while downloading or writing.
    """
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output
        limit = args.limit

        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        with io.open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                comment_json = json.dumps(comment, ensure_ascii=False)
                # io.open() in text mode only accepts text, while json.dumps()
                # may hand back a byte string on Python 2.
                print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
                count += 1
                # '\r' rewinds the cursor so the counter updates in place.
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break
        print('\nDone!')

    except Exception as e:
        # Top-level boundary: report any failure on stderr (keeping it out of
        # the progress output on stdout) and signal it through the exit code.
        print('Error:', str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main(sys.argv[1:])