Make YouTube playlist from posts in Facebook group
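Three small scripts, shown here in reverse order of use. Run the Graph API scraper (the last script below) first to pull every post in the group into all_group_post_data.json, then the pandas filter (the middle script) to keep YouTube links with at least required_likes likes and write them to urls.json, and finally the playlist script (the first script below) to extract the video IDs and add them to your playlist.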
# --- Script 1: add the filtered YouTube links (urls.json) to a YouTube playlist ---
import requests
import json
import re

from more_itertools import unique_everseen

with open("urls.json", "r") as fp:
    urls = json.load(fp)
# extract the video ID from the different kinds of YouTube links that appear in posts
def get_id(url):
    if "youtube" in url:
        if "attribution" in url:
            # attribution_link URLs carry the real watch URL percent-encoded,
            # so the ID sits between "%3D" (=) and "%26" (&)
            try:
                video_id = re.search(r"%3D(.*?)%26", url).group(1)
            except Exception:
                print(url)
                video_id = ""
        else:
            # plain youtube.com links: take the value of the v= (or ci=) parameter
            try:
                video_id = re.search(r"(\?|&)(v|ci)=([^&]*)", url).group(3)
            except Exception:
                print(url)
                video_id = ""
    else:
        # short youtu.be links: the ID is the path segment after ".be/"
        try:
            video_id = re.search(r"\.be/([^?&]*)", url).group(1)
        except Exception:
            print(url)
            video_id = ""
    return video_id
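# Quick sanity checks for get_id on the three link shapes handled above
# (illustrative URLs, not taken from the group data):
#   get_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")   -> "dQw4w9WgXcQ"
#   get_id("https://youtu.be/dQw4w9WgXcQ?t=42")             -> "dQw4w9WgXcQ"
#   get_id("https://www.youtube.com/attribution_link?u=%2Fwatch%3Fv%3DdQw4w9WgXcQ%26feature%3Dshare")
#                                                            -> "dQw4w9WgXcQ"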
# get video IDs from URLs, then deduplicate
ids = list(map(get_id, urls))
ids = list(filter(lambda x: len(x) > 0, ids))
ids = list(unique_everseen(ids))
# access token can be acquired from https://developers.google.com/oauthplayground/
access_token = "YOUR_ACCESS_TOKEN"
# your YouTube playlist ID can be found in the playlist URL
playlist_id = "YOUR_PLAYLIST_ID"
# YouTube Data API v3 playlistItems endpoint; a POST with part=snippet inserts a video
path = "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&access_token={access_token}".format(access_token=access_token)
headers = {"Content-Type": "application/json"}
# finally insert the video into your playlist
for video_id in ids:
    payload = {
        "snippet": {
            "playlistId": playlist_id,
            "resourceId": {
                "videoId": video_id,
                "kind": "youtube#video"
            }
        }
    }
    request = requests.post(path, data=json.dumps(payload), headers=headers)
    # for confirmation of success
    print(request.content)
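    # A minimal failure check (an added sketch: it assumes failed inserts should
    # just be reported and skipped; the API returns a non-2xx status on error)
    if request.status_code != 200:
        print("Failed to add video {video_id}: HTTP {status}".format(
            video_id=video_id, status=request.status_code))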
# --- Script 2: filter the scraped group posts down to well-liked YouTube links ---
import pandas as pd
import json

# set required likes here
required_likes = 5

all_post_data = []
filename = "all_group_post_data.json"
with open(filename, "r", encoding="utf-8") as fp:
    all_post_data += json.load(fp)
one_dimension_data = []
# reshape the nested JSON into flat records for use with pandas;
# missing fields fall back to an empty string (or 0 for likes)
for post in all_post_data:
    post_data = {
        "poster": post.get("from", {}).get("name", ""),
        "message": post.get("message", ""),
        "name": post.get("name", ""),
        "link": post.get("link", ""),
        "likes": len(post.get("likes", {}).get("data", [])),
        "url": post.get("permalink_url", ""),
        "time": post.get("created_time", ""),
    }
    one_dimension_data.append(post_data)
# create dataframe for easy filtering and manipulation
df = pd.DataFrame(one_dimension_data)
# all link fields must contain 'youtu'
youtube_df = df[df["link"].str.contains("youtu")]
# all posts must have required_likes or more
likes_youtube_df = youtube_df[youtube_df["likes"] >= required_likes]
urls_series = likes_youtube_df["link"]
urls = list(urls_series)
# serialise YouTube URLs
with open("urls.json", "w") as fp:
    json.dump(urls, fp)
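# urls.json is now a flat JSON list of the liked YouTube links, e.g. (illustrative):
#   ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "https://youtu.be/9bZkp7q19f0"]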
# --- Script 3: scrape every post in the Facebook group via the Graph API ---
import requests
import json

# Facebook Graph API path; group_id appears in the group URL
# A personal access token can be acquired here: https://developers.facebook.com/tools/accesstoken/
path = "https://graph.facebook.com/v2.9/"
group_id = "YOUR_GROUP_ID"
group_posts_path = "{group_id}/feed/?fields=id".format(group_id=group_id)
access_token = "YOUR_ACCESS_TOKEN"
group_url = "{path}{group_posts_path}&access_token={access_token}".format(
    path=path,
    group_posts_path=group_posts_path,
    access_token=access_token)

url_array = []
# walk the feed pagination and collect every page URL to scrape
def recurse_pages(url):
    global url_array
    response = requests.get(url)
    json_content = response.json()
    print("Getting next url after {url}".format(url=url))
    if len(json_content["data"]) > 0:
        url_array.append(url)
        next_url = json_content.get("paging", {}).get("next")
        if next_url:
            recurse_pages(next_url)

recurse_pages(group_url)
# serialise URLs to scrape
with open("all_group_urls.json", "w") as fp:
    json.dump(url_array, fp)
all_post_ids = []

# scrape all urls for post IDs
def get_ids(url):
    print("\nGetting IDs for {}".format(url))
    post_ids = []
    response = requests.get(url)
    json_content = response.json()
    data = json_content["data"]
    if len(data) > 0:
        for post in data:
            post_id = post["id"]
            post_ids.append(post_id)
    return post_ids

for url in url_array:
    post_ids = get_ids(url)
    all_post_ids += post_ids

# serialise post IDs
with open("all_group_post_ids.json", "w") as fp:
    json.dump(all_post_ids, fp)
all_post_data = []
# define fields to get from posts
fields = "link,story,message,created_time,from,id,likes,name,permalink_url"

# get all post data from all above IDs
def get_post_data(post_id):
    url = "https://graph.facebook.com/{post_id}?fields={fields}&access_token={access_token}".format(
        post_id=post_id,
        fields=fields,
        access_token=access_token)
    print("\nGetting data from {url}".format(url=url))
    response = requests.get(url)
    data = response.json()
    return data

for post_id in all_post_ids:
    all_post_data.append(get_post_data(post_id))

# serialise post data
with open("all_group_post_data.json", "w") as fp:
    json.dump(all_post_data, fp)
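# Each entry in all_group_post_data.json is one Graph API post object with the
# requested fields, roughly this shape (illustrative values; this is the shape the
# filtering script above consumes):
#   {
#     "id": "GROUPID_POSTID",
#     "from": {"name": "Poster Name", "id": "..."},
#     "message": "Check this one out",
#     "link": "https://youtu.be/dQw4w9WgXcQ",
#     "name": "Video title",
#     "likes": {"data": [{"id": "...", "name": "..."}]},
#     "permalink_url": "https://www.facebook.com/groups/.../permalink/.../",
#     "created_time": "2017-04-29T13:48:00+0000"
#   }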