# Source code for youtube_api.parsers

import json
import sys
import datetime
from collections import OrderedDict

if sys.version_info[0] == 2:
    from collections import Iterable
else:
    from collections.abc import Iterable

from youtube_api.youtube_api_utils import parse_yt_datetime

"""
This module contains the parsers for the raw JSON responses
returned by the API. Use `raw_json` to return the output as-is.
"""

# Public API of this module: the names a wildcard import of it exposes.
# NOTE(review): raw_json_with_datetime is defined in this file but is not
# listed here -- confirm whether the omission is intentional.
__all__ = [
    'raw_json',
    'parse_video_metadata',
    'parse_channel_metadata',
    'parse_rec_video_metadata',
    'parse_video_url',
    'parse_subscription_descriptive',
    'parse_featured_channels',
    'parse_comment_metadata',
    'parse_playlist_metadata',
    'parse_caption_track',
]

def raw_json(item):
    '''
    Returns the raw json output from the API.

    This is the pass-through parser: the object handed in is returned as-is,
    without being copied or modified.

    :param item: json document
    :type item: dict

    :returns: the very same object that was passed in
    :rtype: dict
    '''
    return item

def raw_json_with_datetime(item):
    '''
    Stamps the raw json output from the API with the collection day and
    returns it.

    The stamp is stored in ``item`` itself under the ``collection_date`` key
    as a ``YYYY-MM-DD`` string, so the caller's dictionary is modified in
    place and the same object is returned.

    :param item: json document
    :type item: dict

    :returns: ``item``, now carrying a ``collection_date`` entry
    :rtype: dict
    '''
    collected_on = datetime.datetime.now()
    stamp = collected_on.strftime('%Y-%m-%d')
    item['collection_date'] = stamp
    return item

def parse_video_metadata(item):
    '''
    Parses and processes raw output and returns video_id, channel_title,
    channel_id, video_publish_date, video_title, video_description,
    video_category, video_view_count, video_comment_count, video_like_count,
    video_dislike_count, video_thumbnail, video_tags, collection_date.

    ``video_tags`` is the collection of tags joined with ``|`` (an empty
    string when there are no tags), ``video_publish_date`` is ``publishedAt``
    passed through ``parse_yt_datetime`` and ``collection_date`` is the value
    of ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``id``, ``snippet``, ``statistics`` or the ``high``
        thumbnail url is missing from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    snippet = item["snippet"]
    statistics_of = lambda key: item["statistics"].get(key)

    tags = snippet.get('tags')
    # A bare string is itself iterable on Python 3; joining it would put a
    # "|" between every character, so only real collections are joined.
    if isinstance(tags, Iterable) and not isinstance(tags, (str, bytes)):
        video_tags = '|'.join(tags)
    else:
        video_tags = ''

    video_meta = {
        "video_id" : item['id'],
        "channel_title" : snippet.get("channelTitle"),
        "channel_id" : snippet.get("channelId"),
        "video_publish_date" : parse_yt_datetime(snippet.get("publishedAt")),
        "video_title" : snippet.get("title"),
        "video_description" : snippet.get("description"),
        "video_category" : snippet.get("categoryId"),
        "video_view_count" : statistics_of("viewCount"),
        "video_comment_count" : statistics_of("commentCount"),
        "video_like_count" : statistics_of("likeCount"),
        "video_dislike_count" : statistics_of("dislikeCount"),
        "video_thumbnail" : snippet["thumbnails"]["high"]["url"],
        "video_tags" : video_tags,
        "collection_date" : datetime.datetime.now()
    }

    return video_meta


def parse_video_url(item):
    '''
    Parses and processes raw output and returns video_id, channel_id,
    publish_date, collection_date.

    ``video_id`` is read from ``snippet.resourceId.videoId``,
    ``publish_date`` is ``snippet.publishedAt`` passed through
    ``parse_yt_datetime`` and ``collection_date`` is the value of
    ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``snippet`` or ``snippet.resourceId`` is missing
    '''
    if not isinstance(item, dict):
        return dict()

    snippet = item['snippet']
    publish_date = parse_yt_datetime(snippet.get('publishedAt'))
    video_id = snippet['resourceId'].get('videoId')
    channel_id = snippet.get('channelId')

    return {
        "video_id" : video_id,
        "channel_id" : channel_id,
        "publish_date" : publish_date,
        "collection_date" : datetime.datetime.now()
    }


def parse_channel_metadata(item):
    '''
    Parses and processes raw output and returns channel_id, title,
    account_creation_date, keywords, description, view_count, video_count,
    subscription_count, playlist_id_likes, playlist_id_uploads, topic_ids,
    country, collection_date.

    ``topic_ids`` holds the entries of ``topicDetails.topicCategories`` joined
    with ``|``; it is ``None`` when the channel carries no topic categories.
    ``account_creation_date`` is ``snippet.publishedAt`` passed through
    ``parse_yt_datetime`` and ``collection_date`` is the value of
    ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``id``, ``snippet``, ``statistics``,
        ``brandingSettings.channel`` or ``contentDetails.relatedPlaylists``
        is missing from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    # topicDetails may be absent or empty, and may lack topicCategories;
    # joining None would raise TypeError, so only join when a list is present.
    topic = None
    topic_details = item.get('topicDetails')
    if topic_details:
        categories = topic_details.get('topicCategories')
        if categories is not None:
            topic = '|'.join(categories)

    snippet = item["snippet"]

    channel_meta = {
        "channel_id" : item['id'],
        "title" : snippet.get("title"),
        "account_creation_date" : parse_yt_datetime(snippet.get("publishedAt")),
        "keywords" : item['brandingSettings']['channel'].get('keywords'),
        "description" : snippet.get("description"),
        "view_count" : item["statistics"].get("viewCount"),
        "video_count" : item["statistics"].get("videoCount"),
        "subscription_count" : item["statistics"].get("subscriberCount"),
        "playlist_id_likes" : item['contentDetails']['relatedPlaylists'].get('likes'),
        "playlist_id_uploads" : item['contentDetails']['relatedPlaylists'].get('uploads'),
        "topic_ids" : topic,
        "country" : snippet.get('country'),
        "collection_date" : datetime.datetime.now()
    }

    return channel_meta


def parse_subscription_descriptive(item):
    '''
    Parses and processes raw output and returns subscription_title,
    subscription_channel_id, subscription_kind, subscription_publish_date,
    collection_date.

    The channel id and kind are read from ``snippet.resourceId``,
    ``subscription_publish_date`` is ``snippet.publishedAt`` passed through
    ``parse_yt_datetime`` and ``collection_date`` is the value of
    ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``snippet``, ``snippet.title`` or
        ``snippet.resourceId`` is missing from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    snippet = item['snippet']

    sub_meta = {
        "subscription_title" : snippet['title'],
        "subscription_channel_id" : snippet['resourceId'].get('channelId'),
        "subscription_kind" : snippet['resourceId'].get('kind'),
        "subscription_publish_date" : parse_yt_datetime(snippet.get('publishedAt')),
        "collection_date" : datetime.datetime.now()
    }

    return sub_meta


def parse_featured_channels(item):
    '''
    Parses and processes raw output and returns a dictionary where the key is
    the channel_id and the value is the list of featured channel URLs found
    under ``brandingSettings.channel.featuredChannelsUrls`` (an empty list
    when that entry is absent).

    :param item: json document
    :type item: dict

    :returns: single-entry dictionary mapping the channel id to its featured
        channels, or an empty dictionary if ``item`` is not a dict
    :rtype: dict

    :raises KeyError: if ``id`` or ``brandingSettings.channel`` is missing
        from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    featured = item['brandingSettings']['channel'].get('featuredChannelsUrls', [])
    return {item['id']: featured}


def parse_playlist_metadata(item):
    '''
    Parses and processes raw output and returns playlist_name, playlist_id,
    playlist_publish_date, playlist_n_videos, channel_id, channel_name,
    collection_date.

    ``playlist_n_videos`` is read from ``contentDetails.itemCount``,
    ``playlist_publish_date`` is ``snippet.publishedAt`` passed through
    ``parse_yt_datetime`` and ``collection_date`` is the value of
    ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``id``, ``snippet`` or ``contentDetails`` is missing
        from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    snippet = item['snippet']

    playlist_meta = {
        "playlist_name" : snippet.get('title'),
        "playlist_id" : item['id'],
        "playlist_publish_date" : parse_yt_datetime(snippet.get('publishedAt')),
        "playlist_n_videos" : item['contentDetails'].get('itemCount'),
        "channel_id" : snippet.get('channelId'),
        "channel_name" : snippet.get('channelTitle'),
        "collection_date" : datetime.datetime.now()
    }

    return playlist_meta


def parse_comment_metadata(item):
    '''
    Parses and processes raw output and returns video_id,
    commenter_channel_url, commenter_channel_id,
    commenter_channel_display_name, comment_id, comment_like_count,
    comment_publish_date, text, commenter_rating, comment_parent_id,
    collection_date, reply_count.

    When ``item`` wraps a comment in ``snippet.topLevelComment``, the fields
    are taken from that comment and ``reply_count`` from the wrapper's
    ``snippet.totalReplyCount``. Otherwise ``item`` itself is parsed as the
    comment and ``reply_count`` is ``item.get('totalReplyCount')``.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``snippet`` is missing from ``item`` (or from the
        wrapped top-level comment)
    '''
    if not isinstance(item, dict):
        return dict()

    # Remember the wrapper's snippet (if any) explicitly, so the reply count
    # lookup below needs no try/except around a possibly-unbound name.
    thread_snippet = None
    if item['snippet'].get('topLevelComment'):
        thread_snippet = item['snippet']
        item = item['snippet']['topLevelComment']

    snippet = item["snippet"]
    # authorChannelId can be missing or explicitly null; treat both as empty.
    author_channel = snippet.get('authorChannelId') or dict()

    comment_meta = {
        "video_id" : snippet.get("videoId"),
        "commenter_channel_url" : snippet.get("authorChannelUrl"),
        "commenter_channel_id" : author_channel.get('value', None),
        "commenter_channel_display_name" : snippet.get('authorDisplayName'),
        "comment_id" : item.get("id"),
        "comment_like_count" : snippet.get("likeCount"),
        "comment_publish_date" : parse_yt_datetime(snippet.get("publishedAt")),
        "text" : snippet.get("textDisplay"),
        "commenter_rating" : snippet.get("viewerRating"),
        "comment_parent_id" : snippet.get("parentId"),
        "collection_date" : datetime.datetime.now()
    }

    reply_source = item if thread_snippet is None else thread_snippet
    comment_meta['reply_count'] = reply_source.get('totalReplyCount')

    return comment_meta


def parse_rec_video_metadata(item):
    '''
    Parses and processes raw output and returns video_id, channel_title,
    channel_id, video_publish_date, video_title, video_description,
    video_category, video_thumbnail, collection_date.

    Unlike ``parse_video_metadata``, ``video_id`` is read from
    ``id.videoId`` (``id`` is expected to be a dictionary here).
    ``video_publish_date`` is ``snippet.publishedAt`` passed through
    ``parse_yt_datetime`` and ``collection_date`` is the value of
    ``datetime.datetime.now()`` at parsing time.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``id``, ``snippet`` or the ``high`` thumbnail url is
        missing from ``item``
    '''
    if not isinstance(item, dict):
        return dict()

    video_id = item['id'].get('videoId')
    snippet = item["snippet"]

    video_meta = {
        "video_id" : video_id,
        "channel_title" : snippet.get("channelTitle"),
        "channel_id" : snippet.get("channelId"),
        "video_publish_date" : parse_yt_datetime(snippet.get("publishedAt")),
        "video_title" : snippet.get("title"),
        "video_description" : snippet.get("description"),
        "video_category" : snippet.get("categoryId"),
        "video_thumbnail" : snippet["thumbnails"]["high"]["url"],
        "collection_date" : datetime.datetime.now()
    }

    return video_meta

def parse_caption_track(item):
    '''
    Returns the video_id, caption and collection_date.

    The three values are copied from ``item`` unchanged; any other keys in
    ``item`` are dropped.

    :param item: json document
    :type item: dict

    :returns: parsed dictionary, or an empty dictionary if ``item`` is not a
        dict
    :rtype: dict

    :raises KeyError: if ``video_id``, ``caption`` or ``collection_date`` is
        missing from ``item``
    '''
    # Same guard as the other parsers in this module: non-dict input yields
    # an empty result instead of an error.
    if not isinstance(item, dict):
        return dict()

    #TODO: convert known errors into an error message.

    caption_meta = {
        "video_id" : item['video_id'],
        "caption" : item['caption'],
        "collection_date" : item['collection_date']
    }

    return caption_meta