Youtube Comment Threads Downloading

김건
Commit afa0a7f744484548ed5892af4c4b50af4e369661 afa0a7f7 1 parent d48ce4be
Showing 4 changed files with 276 additions and 0 deletions
Youtube/.gitignore
Youtube/LICENSE
Youtube/README.md
Youtube/downloader.py
--- a/Youtube/.gitignore 0 → 100644
View file @afa0a7f
+++ b/Youtube/.gitignore 0 → 100644
View file @afa0a7f
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
--- a/Youtube/LICENSE 0 → 100644
View file @afa0a7f
+++ b/Youtube/LICENSE 0 → 100644
View file @afa0a7f
+The MIT License (MIT)
+
+Copyright (c) 2015 Egbert Bouman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/Youtube/README.md 0 → 100644
View file @afa0a7f
+++ b/Youtube/README.md 0 → 100644
View file @afa0a7f
+# youtube-comment-downloader
+A simple script for downloading YouTube comments without using the YouTube API. The output is line-delimited JSON (one comment object per line).
+
+### Dependencies
+* Python 2.7+
+* requests
+* lxml
+* cssselect
+
+The python packages can be installed with
+
+    pip install requests
+    pip install lxml
+    pip install cssselect
+
+### Usage
+```
+usage: downloader.py [--help] [--youtubeid YOUTUBEID] [--output OUTPUT]
+                     [--limit LIMIT]
+
+Download Youtube comments without using the Youtube API
+
+optional arguments:
+  --help, -h            Show this help message and exit
+  --youtubeid YOUTUBEID, -y YOUTUBEID
+                        ID of Youtube video for which to download the comments
+  --output OUTPUT, -o OUTPUT
+                        Output filename (output format is line delimited JSON)
+  --limit LIMIT, -l LIMIT
+                        Limit the number of comments
+```
--- a/Youtube/downloader.py 0 → 100644
View file @afa0a7f
+++ b/Youtube/downloader.py 0 → 100644
View file @afa0a7f
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import json
+import requests
+import argparse
+import lxml.html
+import io
+
+from lxml.cssselect import CSSSelector
+
+# URL template of the page holding the initial batch of comments; the
+# ``{youtube_id}`` placeholder is filled in with the video ID.
+YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
+# Endpoint that is POSTed to for further comment pages and for replies.
+YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
+
+# User-Agent header value set on the HTTP session used for all requests.
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
+
+
+def find_value(html, key, num_chars=2):
+    """Return the double-quote-terminated value that follows *key* in *html*.
+
+    The value starts ``num_chars`` characters after the end of the first
+    occurrence of *key* (skipping separators such as ``="``) and runs up to,
+    but not including, the next ``"`` character.
+
+    Args:
+        html: Text to search.
+        key: Marker string that precedes the value.
+        num_chars: Number of characters to skip between the end of *key*
+            and the start of the value.
+
+    Returns:
+        The extracted value, or an empty string when *key* does not occur
+        in *html* or no closing ``"`` follows the value.
+    """
+    key_pos = html.find(key)
+    if key_pos == -1:
+        # Without this guard the -1 from find() would be used as an offset
+        # and an unrelated slice of the page would be returned.
+        return ''
+
+    pos_begin = key_pos + len(key) + num_chars
+    pos_end = html.find('"', pos_begin)
+    if pos_end == -1:
+        # No terminating quote: there is no well-delimited value to return.
+        return ''
+
+    return html[pos_begin:pos_end]
+
+
+def extract_comments(html):
+    """Yield one dict per ``.comment-item`` element found in *html*.
+
+    Each dict has the keys ``'cid'`` (the element's ``data-cid`` attribute),
+    ``'text'``, ``'time'`` (whitespace-stripped) and ``'author'``. The text
+    fields are taken from the first matching descendant of the item, so an
+    item lacking one of them raises ``IndexError``.
+    """
+    select_items = CSSSelector('.comment-item')
+    select_text = CSSSelector('.comment-text-content')
+    select_time = CSSSelector('.time')
+    select_author = CSSSelector('.user-name')
+
+    document = lxml.html.fromstring(html)
+    for node in select_items(document):
+        cid = node.get('data-cid')
+        text = select_text(node)[0].text_content()
+        posted = select_time(node)[0].text_content().strip()
+        author = select_author(node)[0].text_content()
+        yield {'cid': cid, 'text': text, 'time': posted, 'author': author}
+
+
+def extract_reply_cids(html):
+    """Return the ``data-cid`` of every "load replies" link in *html*.
+
+    Links are the ``.load-comments`` elements that are direct children of a
+    ``.comment-replies-header`` element; the result is a list in document
+    order as returned by the selector.
+    """
+    select_links = CSSSelector('.comment-replies-header > .load-comments')
+    cids = []
+    for link in select_links(lxml.html.fromstring(html)):
+        cids.append(link.get('data-cid'))
+    return cids
+
+
+def ajax_request(session, url, params, data, retries=10, sleep=20):
+    """POST to *url* and return the parsed comment payload, retrying on failure.
+
+    Args:
+        session: Object with a ``post(url, params=..., data=...)`` method
+            whose result exposes ``status_code`` and ``text``.
+        url: Endpoint to POST to.
+        params: Query-string parameters for the request.
+        data: Form data for the request.
+        retries: Maximum number of attempts.
+        sleep: Seconds to wait between two attempts.
+
+    Returns:
+        A ``(page_token, html_content)`` tuple taken from the JSON body of
+        the first response with status 200 (``page_token`` is ``None`` when
+        the body has no such key), or ``None`` when every attempt returned a
+        different status.
+
+    Raises:
+        ValueError: If a 200 response body is not valid JSON.
+        KeyError: If the JSON body has no ``'html_content'`` key.
+    """
+    for attempt in range(retries):
+        response = session.post(url, params=params, data=data)
+        if response.status_code == 200:
+            response_dict = json.loads(response.text)
+            return response_dict.get('page_token', None), response_dict['html_content']
+        # Only back off when another attempt follows; sleeping after the
+        # last failure would just delay reporting it to the caller.
+        if attempt < retries - 1:
+            time.sleep(sleep)
+    return None
+
+
+def download_comments(youtube_id, sleep=1):
+    """Yield the comments and replies of a Youtube video as dicts.
+
+    The first batch is scraped from the page at YOUTUBE_COMMENTS_URL, further
+    batches are requested from YOUTUBE_COMMENTS_AJAX_URL for as long as a page
+    token is returned, and finally the replies of every collected thread are
+    requested. Comments obtained through the AJAX endpoint are skipped when
+    their id has already been yielded. Each phase stops early when its
+    request ultimately fails (``ajax_request`` returns nothing).
+
+    Args:
+        youtube_id: ID of the video whose comments are downloaded.
+        sleep: Seconds to wait after each AJAX request.
+
+    Yields:
+        The dicts produced by ``extract_comments`` (keys ``'cid'``,
+        ``'text'``, ``'time'`` and ``'author'``).
+    """
+    # A set keeps the "already yielded?" check constant-time no matter how
+    # many comments the video has.
+    seen_cids = set()
+
+    # The context manager closes the session once the generator finishes or
+    # is discarded by the consumer.
+    with requests.Session() as session:
+        session.headers['User-Agent'] = USER_AGENT
+
+        # Get Youtube page with initial comments
+        response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
+        html = response.text
+        reply_cids = extract_reply_cids(html)
+
+        for comment in extract_comments(html):
+            seen_cids.add(comment['cid'])
+            yield comment
+
+        page_token = find_value(html, 'data-token')
+        session_token = find_value(html, 'XSRF_TOKEN', 4)
+
+        first_iteration = True
+
+        # Get remaining comments (the same as pressing the 'Show more' button)
+        while page_token:
+            data = {'video_id': youtube_id,
+                    'session_token': session_token}
+
+            params = {'action_load_comments': 1,
+                      'order_by_time': True,
+                      'filter': youtube_id}
+
+            # The first AJAX request sends 'order_menu' instead of a page
+            # token; later requests continue from the token of the previous
+            # response.
+            if first_iteration:
+                params['order_menu'] = True
+            else:
+                data['page_token'] = page_token
+
+            response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+            if not response:
+                break
+
+            page_token, html = response
+
+            reply_cids += extract_reply_cids(html)
+            for comment in extract_comments(html):
+                if comment['cid'] not in seen_cids:
+                    seen_cids.add(comment['cid'])
+                    yield comment
+
+            first_iteration = False
+            time.sleep(sleep)
+
+        # Get replies (the same as pressing the 'View all X replies' link)
+        for cid in reply_cids:
+            data = {'comment_id': cid,
+                    'video_id': youtube_id,
+                    'can_reply': 1,
+                    'session_token': session_token}
+
+            params = {'action_load_replies': 1,
+                      'order_by_time': True,
+                      'filter': youtube_id,
+                      'tab': 'inbox'}
+
+            response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+            if not response:
+                break
+
+            _, html = response
+
+            for comment in extract_comments(html):
+                if comment['cid'] not in seen_cids:
+                    seen_cids.add(comment['cid'])
+                    yield comment
+            time.sleep(sleep)
+
+
+def main(argv):
+    """Command-line entry point: download a video's comments to a file.
+
+    Parses *argv* (the arguments without the program name), then writes one
+    JSON object per line to the output file while reporting progress on
+    stdout. Stops once ``--limit`` comments have been written, if a limit was
+    given. Any ``Exception`` is reported as ``Error: ...`` and the process
+    exits with status 1.
+    """
+    parser = argparse.ArgumentParser(
+        add_help=False,
+        description=('Download Youtube comments without using the Youtube API'))
+    parser.add_argument('--help', '-h', action='help',
+                        default=argparse.SUPPRESS,
+                        help='Show this help message and exit')
+    parser.add_argument('--youtubeid', '-y',
+                        help='ID of Youtube video for which to download the comments')
+    parser.add_argument('--output', '-o',
+                        help='Output filename (output format is line delimited JSON)')
+    parser.add_argument('--limit', '-l', type=int,
+                        help='Limit the number of comments')
+
+    try:
+        options = parser.parse_args(argv)
+        video_id = options.youtubeid
+        out_path = options.output
+        max_comments = options.limit
+
+        # Both the video ID and the output file are mandatory.
+        if not (video_id and out_path):
+            parser.print_usage()
+            raise ValueError('you need to specify a Youtube ID and an output filename')
+
+        print('Downloading Youtube comments for video:', video_id)
+        count = 0
+        with io.open(out_path, 'w', encoding='utf8') as fp:
+            for count, comment in enumerate(download_comments(video_id), 1):
+                line = json.dumps(comment, ensure_ascii=False)
+                # json.dumps may hand back bytes; the file expects text.
+                if isinstance(line, bytes):
+                    line = line.decode('utf-8')
+                print(line, file=fp)
+                sys.stdout.write('Downloaded %d comment(s)\r' % count)
+                sys.stdout.flush()
+                if max_comments and count >= max_comments:
+                    break
+        print('\nDone!')
+
+    except Exception as e:
+        print('Error:', str(e))
+        sys.exit(1)
+
+
+# Run the command-line interface only when executed as a script, passing the
+# arguments without the program name.
+if __name__ == "__main__":
+    main(sys.argv[1:])