Showing 4 changed files with 276 additions and 0 deletions
Youtube/.gitignore
0 → 100644
1 | +# Byte-compiled / optimized / DLL files | ||
2 | +__pycache__/ | ||
3 | +*.py[cod] | ||
4 | + | ||
5 | +# C extensions | ||
6 | +*.so | ||
7 | + | ||
8 | +# Distribution / packaging | ||
9 | +.Python | ||
10 | +env/ | ||
11 | +build/ | ||
12 | +develop-eggs/ | ||
13 | +dist/ | ||
14 | +downloads/ | ||
15 | +eggs/ | ||
16 | +.eggs/ | ||
17 | +lib/ | ||
18 | +lib64/ | ||
19 | +parts/ | ||
20 | +sdist/ | ||
21 | +var/ | ||
22 | +*.egg-info/ | ||
23 | +.installed.cfg | ||
24 | +*.egg | ||
25 | + | ||
26 | +# PyInstaller | ||
27 | +# Usually these files are written by a python script from a template | ||
28 | +# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
29 | +*.manifest | ||
30 | +*.spec | ||
31 | + | ||
32 | +# Installer logs | ||
33 | +pip-log.txt | ||
34 | +pip-delete-this-directory.txt | ||
35 | + | ||
36 | +# Unit test / coverage reports | ||
37 | +htmlcov/ | ||
38 | +.tox/ | ||
39 | +.coverage | ||
40 | +.coverage.* | ||
41 | +.cache | ||
42 | +nosetests.xml | ||
43 | +coverage.xml | ||
44 | +*,cover | ||
45 | + | ||
46 | +# Translations | ||
47 | +*.mo | ||
48 | +*.pot | ||
49 | + | ||
50 | +# Django stuff: | ||
51 | +*.log | ||
52 | + | ||
53 | +# Sphinx documentation | ||
54 | +docs/_build/ | ||
55 | + | ||
56 | +# PyBuilder | ||
57 | +target/ |
Youtube/LICENSE
0 → 100644
1 | +The MIT License (MIT) | ||
2 | + | ||
3 | +Copyright (c) 2015 Egbert Bouman | ||
4 | + | ||
5 | +Permission is hereby granted, free of charge, to any person obtaining a copy | ||
6 | +of this software and associated documentation files (the "Software"), to deal | ||
7 | +in the Software without restriction, including without limitation the rights | ||
8 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
9 | +copies of the Software, and to permit persons to whom the Software is | ||
10 | +furnished to do so, subject to the following conditions: | ||
11 | + | ||
12 | +The above copyright notice and this permission notice shall be included in all | ||
13 | +copies or substantial portions of the Software. | ||
14 | + | ||
15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
16 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
17 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
18 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
19 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
20 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
21 | +SOFTWARE. | ||
22 | + |
Youtube/README.md
0 → 100644
1 | +# youtube-comment-downloader | ||
2 | +Simple script for downloading YouTube comments without using the YouTube API. The output is in line-delimited JSON. | ||
3 | + | ||
4 | +### Dependencies | ||
5 | +* Python 2.7+ | ||
6 | +* requests | ||
7 | +* lxml | ||
8 | +* cssselect | ||
9 | + | ||
10 | +The python packages can be installed with | ||
11 | + | ||
12 | + pip install requests | ||
13 | + pip install lxml | ||
14 | + pip install cssselect | ||
15 | + | ||
16 | +### Usage | ||
17 | +``` | ||
18 | +usage: downloader.py [--help] [--youtubeid YOUTUBEID] [--output OUTPUT] | ||
19 | + | ||
20 | +Download Youtube comments without using the Youtube API | ||
21 | + | ||
22 | +optional arguments: | ||
23 | + --help, -h Show this help message and exit | ||
24 | + --youtubeid YOUTUBEID, -y YOUTUBEID | ||
25 | + ID of Youtube video for which to download the comments | ||
26 | + --output OUTPUT, -o OUTPUT | ||
27 | + Output filename (output format is line delimited JSON) | ||
28 | +``` |
Youtube/downloader.py
0 → 100644
1 | +#!/usr/bin/env python | ||
2 | + | ||
3 | +from __future__ import print_function | ||
4 | + | ||
5 | +import os | ||
6 | +import sys | ||
7 | +import time | ||
8 | +import json | ||
9 | +import requests | ||
10 | +import argparse | ||
11 | +import lxml.html | ||
12 | +import io | ||
13 | + | ||
14 | +from lxml.cssselect import CSSSelector | ||
15 | + | ||
16 | +YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}' | ||
17 | +YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax' | ||
18 | + | ||
19 | +USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36' | ||
20 | + | ||
21 | + | ||
def find_value(html, key, num_chars=2):
    """Extract the double-quote-delimited value that follows *key* in *html*.

    *num_chars* is the number of separator characters between the end of
    *key* and the first character of the value (e.g. 2 for a '="' separator,
    4 for an "': \"" separator).

    Returns None when *key* does not occur in *html*; previously a missing
    key made ``html.find`` return -1 and the slice silently produced garbage.
    """
    pos_key = html.find(key)
    if pos_key == -1:
        return None
    pos_begin = pos_key + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin:pos_end]
26 | + | ||
27 | + | ||
def extract_comments(html):
    """Yield one dict per top-level comment found in *html*.

    Each dict carries the keys 'cid', 'text', 'time' and 'author', pulled
    from YouTube's old comment markup via CSS selectors.
    """
    root = lxml.html.fromstring(html)
    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    for node in select_items(root):
        yield {
            'cid': node.get('data-cid'),
            'text': select_text(node)[0].text_content(),
            'time': select_time(node)[0].text_content().strip(),
            'author': select_author(node)[0].text_content(),
        }
40 | + | ||
41 | + | ||
def extract_reply_cids(html):
    """Return the ids of comments in *html* that have a collapsed replies link."""
    root = lxml.html.fromstring(html)
    load_links = CSSSelector('.comment-replies-header > .load-comments')(root)
    return [link.get('data-cid') for link in load_links]
46 | + | ||
47 | + | ||
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to *url* and return ``(page_token, html_content)`` from its JSON body.

    Retries up to *retries* times, sleeping *sleep* seconds after each
    non-200 response. Returns None (implicitly) when every attempt fails.
    """
    for _attempt in range(retries):
        resp = session.post(url, params=params, data=data)
        if resp.status_code != 200:
            time.sleep(sleep)
            continue
        payload = json.loads(resp.text)
        return payload.get('page_token'), payload['html_content']
56 | + | ||
57 | + | ||
def download_comments(youtube_id, sleep=1):
    """Yield comment dicts for the video *youtube_id* without using the API.

    Scrapes the all_comments page first, then pages through the AJAX
    endpoint (equivalent to pressing 'Show more'), and finally fetches the
    replies of each thread (equivalent to 'View all X replies'). Duplicate
    comments that appear on several pages are yielded only once. Sleeps
    *sleep* seconds between AJAX requests to avoid hammering the server.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # Set membership is O(1); the original list made each duplicate check O(n),
    # turning the whole download accidentally quadratic in the comment count.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first AJAX call requests the order menu; later calls page with a token.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Deduplicate reply thread ids while preserving order: the same thread can
    # be listed on several pages, and each duplicate cost a pointless request.
    unique_reply_cids = []
    seen_reply_cids = set()
    for cid in reply_cids:
        if cid not in seen_reply_cids:
            seen_reply_cids.add(cid)
            unique_reply_cids.append(cid)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in unique_reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            # NOTE: matches the original behavior of abandoning all remaining
            # reply threads after one failed request.
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
129 | + | ||
130 | + | ||
def main(argv):
    """Command-line entry point.

    Parses *argv*, downloads the comments of the requested video and writes
    them to the output file as line-delimited JSON. Exits with status 1 on
    any failure, reporting the error on stderr (previously errors went to
    stdout, polluting anything piped from the script).
    """
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output
        limit = args.limit

        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        # io.open with an explicit encoding writes identical utf8 on Python 2 and 3.
        with io.open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                comment_json = json.dumps(comment, ensure_ascii=False)
                # On Python 2, json.dumps may return bytes; decode before writing.
                print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break
        print('\nDone!')

    except Exception as e:
        print('Error:', str(e), file=sys.stderr)
        sys.exit(1)
166 | + | ||
167 | + | ||
if __name__ == "__main__":
    # Invoke the CLI with the user-supplied arguments (program name stripped).
    main(sys.argv[1:])
-
Please register or log in to post a comment