Youtube Comment Threads Downloading

김건
Commit afa0a7f744484548ed5892af4c4b50af4e369661 afa0a7f7 1 parent d48ce4be
Showing 4 changed files with 276 additions and 0 deletions
Youtube/.gitignore
Youtube/LICENSE
Youtube/README.md
Youtube/downloader.py
--- a/Youtube/.gitignore 0 → 100644
View file @afa0a7f
+++ b/Youtube/.gitignore 0 → 100644
View file @afa0a7f
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ 
+ # C extensions
+ *.so
+ 
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ 
+ # PyInstaller
+ #  Usually these files are written by a python script from a template
+ #  before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+ 
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+ 
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *,cover
+ 
+ # Translations
+ *.mo
+ *.pot
+ 
+ # Django stuff:
+ *.log
+ 
+ # Sphinx documentation
+ docs/_build/
+ 
+ # PyBuilder
+ target/
--- a/Youtube/LICENSE 0 → 100644
View file @afa0a7f
+++ b/Youtube/LICENSE 0 → 100644
View file @afa0a7f
+ The MIT License (MIT)
+ 
+ Copyright (c) 2015 Egbert Bouman
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ 
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ 
--- a/Youtube/README.md 0 → 100644
View file @afa0a7f
+++ b/Youtube/README.md 0 → 100644
View file @afa0a7f
+ # youtube-comment-downloader
+ A simple script for downloading YouTube comments without using the YouTube API. The output is line-delimited JSON: one comment object per line.
+ 
+ ### Dependencies
+ * Python 2.7+
+ * requests
+ * lxml
+ * cssselect
+ 
+ The python packages can be installed with
+ 
+     pip install requests
+     pip install lxml
+     pip install cssselect
+ 
+ ### Usage
+ ```
+ usage: downloader.py [--help] [--youtubeid YOUTUBEID] [--output OUTPUT]
+                      [--limit LIMIT]
+ 
+ Download Youtube comments without using the Youtube API
+ 
+ optional arguments:
+   --help, -h            Show this help message and exit
+   --youtubeid YOUTUBEID, -y YOUTUBEID
+                         ID of Youtube video for which to download the comments
+   --output OUTPUT, -o OUTPUT
+                         Output filename (output format is line delimited JSON)
+   --limit LIMIT, -l LIMIT
+                         Limit the number of comments
+ ```
--- a/Youtube/downloader.py 0 → 100644
View file @afa0a7f
+++ b/Youtube/downloader.py 0 → 100644
View file @afa0a7f
+ #!/usr/bin/env python
+ 
+ from __future__ import print_function
+ 
+ import os
+ import sys
+ import time
+ import json
+ import requests
+ import argparse
+ import lxml.html
+ import io
+ 
+ from lxml.cssselect import CSSSelector
+ 
+ YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'  # page fetched with GET for the initial comments; {youtube_id} is filled in per video
+ YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'  # endpoint POSTed to for further comment pages and for replies
+ 
+ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'  # set as the User-Agent header of the download session
+ 
+ 
+ def find_value(html, key, num_chars=2):
+     pos_begin = html.find(key) + len(key) + num_chars
+     pos_end = html.find('"', pos_begin)
+     return html[pos_begin: pos_end]
+ 
+ 
+ def extract_comments(html):
+     tree = lxml.html.fromstring(html)
+     item_sel = CSSSelector('.comment-item')
+     text_sel = CSSSelector('.comment-text-content')
+     time_sel = CSSSelector('.time')
+     author_sel = CSSSelector('.user-name')
+ 
+     for item in item_sel(tree):
+         yield {'cid': item.get('data-cid'),
+                'text': text_sel(item)[0].text_content(),
+                'time': time_sel(item)[0].text_content().strip(),
+                'author': author_sel(item)[0].text_content()}
+ 
+ 
+ def extract_reply_cids(html):
+     tree = lxml.html.fromstring(html)
+     sel = CSSSelector('.comment-replies-header > .load-comments')
+     return [i.get('data-cid') for i in sel(tree)]
+ 
+ 
+ def ajax_request(session, url, params, data, retries=10, sleep=20):
+     for _ in range(retries):
+         response = session.post(url, params=params, data=data)
+         if response.status_code == 200:
+             response_dict = json.loads(response.text)
+             return response_dict.get('page_token', None), response_dict['html_content']
+         else:
+             time.sleep(sleep)
+ 
+ 
+ def download_comments(youtube_id, sleep=1):
+     session = requests.Session()
+     session.headers['User-Agent'] = USER_AGENT
+ 
+     # Get Youtube page with initial comments
+     response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
+     html = response.text
+     reply_cids = extract_reply_cids(html)
+ 
+     ret_cids = []
+     for comment in extract_comments(html):
+         ret_cids.append(comment['cid'])
+         yield comment
+ 
+     page_token = find_value(html, 'data-token')
+     session_token = find_value(html, 'XSRF_TOKEN', 4)
+ 
+     first_iteration = True
+ 
+     # Get remaining comments (the same as pressing the 'Show more' button)
+     while page_token:
+         data = {'video_id': youtube_id,
+                 'session_token': session_token}
+ 
+         params = {'action_load_comments': 1,
+                   'order_by_time': True,
+                   'filter': youtube_id}
+ 
+         if first_iteration:
+             params['order_menu'] = True
+         else:
+             data['page_token'] = page_token
+ 
+         response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+         if not response:
+             break
+ 
+         page_token, html = response
+ 
+         reply_cids += extract_reply_cids(html)
+         for comment in extract_comments(html):
+             if comment['cid'] not in ret_cids:
+                 ret_cids.append(comment['cid'])
+                 yield comment
+ 
+         first_iteration = False
+         time.sleep(sleep)
+ 
+     # Get replies (the same as pressing the 'View all X replies' link)
+     for cid in reply_cids:
+         data = {'comment_id': cid,
+                 'video_id': youtube_id,
+                 'can_reply': 1,
+                 'session_token': session_token}
+ 
+         params = {'action_load_replies': 1,
+                   'order_by_time': True,
+                   'filter': youtube_id,
+                   'tab': 'inbox'}
+ 
+         response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
+         if not response:
+             break
+ 
+         _, html = response
+ 
+         for comment in extract_comments(html):
+             if comment['cid'] not in ret_cids:
+                 ret_cids.append(comment['cid'])
+                 yield comment
+         time.sleep(sleep)
+ 
+ 
+ def main(argv):
+     parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
+     parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
+     parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
+     parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
+     parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')
+ 
+     try:
+         args = parser.parse_args(argv)
+ 
+         youtube_id = args.youtubeid
+         output = args.output
+         limit = args.limit
+ 
+         if not youtube_id or not output:
+             parser.print_usage()
+             raise ValueError('you need to specify a Youtube ID and an output filename')
+ 
+         print('Downloading Youtube comments for video:', youtube_id)
+         count = 0
+         with io.open(output, 'w', encoding='utf8') as fp:
+             for comment in download_comments(youtube_id):
+                 comment_json = json.dumps(comment, ensure_ascii=False)
+                 print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
+                 count += 1
+                 sys.stdout.write('Downloaded %d comment(s)\r' % count)
+                 sys.stdout.flush()
+                 if limit and count >= limit:
+                     break
+         print('\nDone!')
+ 
+ 
+     except Exception as e:
+         print('Error:', str(e))
+         sys.exit(1)
+ 
+ 
+ if __name__ == "__main__":  # entry point when the file is run as a script
+     main(sys.argv[1:])  # pass the arguments without the program name