Last active
November 2, 2019 03:31
-
-
Save weaming/ccff0f885160516840a6b90640cda3ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import json | |
import requests | |
from json_api.errors import ExceptionWithStatusCode | |
# User-Agent header value sent with every request made by http_get_url.
# The string presents the client as an iPhone (iOS 13.1.2) browser carrying a
# MicroMessenger/7.0.8 token.
UA = (
    'Mozilla/5.0 (iPhone; CPU iPhone OS 13_1_2 like Mac OS X) '
    'AppleWebKit/605.1.15 (KHTML, like Gecko) '
    'Mobile/15E148 MicroMessenger/7.0.8(0x17000820) '
    'NetType/WIFI Language/zh_CN'
)
def unesacpe(url):
    """Return *url* with every backslash-escaped slash turned into a plain slash.

    JSON encoders commonly emit "/" as a backslash followed by "/"; this
    reverses that single escape and leaves everything else untouched.
    """
    pieces = url.split(r"\/")
    return "/".join(pieces)
# A scheme, then a run of URL characters, ending on a character that is not
# typical trailing punctuation (the final class omits "?", "!", ":", ",",
# "." and ";"), so "see http://a.com/x." yields "http://a.com/x".
_URL_RE = re.compile(
    r"(?:https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]",
    re.I,
)


def grep_urls(html):
    """Extract the distinct http(s)/ftp/file URLs found in *html*.

    The text is normalised before matching so that URLs embedded in
    JSON or JavaScript string literals are found too:

    * ``\\/`` is turned back into ``/`` (otherwise ``https:\\/\\/host`` can
      never match, because the pattern requires a literal ``://``);
    * backslash escapes such as ``\\u0026`` are decoded.

    Args:
        html: Arbitrary text (an HTML page, a JSON blob, a chat message...).

    Returns:
        A sorted list of unique URL strings; empty when nothing matches.
    """
    # Undo JSON slash-escaping *before* matching, not after.
    text = html.replace("\\/", "/")
    try:
        text = text.encode().decode('unicode-escape')
    except UnicodeDecodeError:
        # Malformed escapes (a trailing "\", a bare "\x", ...) must not make
        # URL extraction fail; fall back to the slash-unescaped text.
        pass
    return sorted(set(_URL_RE.findall(text)))
def uniq(lst):
    """Return the distinct, non-blank entries of *lst* in sorted order.

    Entries that are falsy (``None``, ``""``) or that consist only of
    whitespace are dropped; surviving entries are returned unmodified.
    """
    kept = {entry for entry in lst if entry and entry.strip()}
    return sorted(kept)
def clean_url(url: str):
    """Return an http(s) URL taken from *url*, or raise a 400 error.

    If *url* already starts with ``"http"`` it is returned unchanged.
    Otherwise it is treated as free-form text and the first http(s) link
    reported by ``grep_urls`` is returned.

    Args:
        url: Either a URL or a piece of text containing one.

    Returns:
        A string starting with ``"http"``.

    Raises:
        ExceptionWithStatusCode: with ``status=400`` when *url* is not a
            string or no http(s) link can be found in it.
    """
    if isinstance(url, str) and not url.startswith("http"):
        # grep_urls also reports ftp:// and file:// links and returns them
        # sorted, so element 0 is not necessarily an http link: pick the
        # first one with the right scheme instead.
        url = next(
            (found for found in grep_urls(url) if found.startswith("http")),
            None,
        )
    if not isinstance(url, str) or not url.startswith("http"):
        raise ExceptionWithStatusCode("invalid url", status=400)
    return url
def http_get_url(url, timeout=10):
    """GET *url* using the module's User-Agent and return ``(status, text)``.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, so a finite default is used.

    Returns:
        A ``(status_code, body_text)`` tuple.

    Raises:
        requests.exceptions.RequestException: network failures, including
            ``requests.exceptions.Timeout``, propagate to the caller.
    """
    resp = requests.get(url, headers={'User-Agent': UA}, timeout=timeout)
    return resp.status_code, resp.text
def get_image_urls_via_instagram_graphql(url):
    """Collect the media URLs of a post by fetching it with ``__a=1`` appended.

    The input is normalised with ``clean_url``, the ``__a=1`` query
    parameter is added, and the response body is parsed as JSON and walked
    with ``get_by_key`` looking for ``video_url`` / ``display_url`` values.

    Returns:
        ``{'urls': [...]}`` with the distinct URLs in sorted order.

    Raises:
        ExceptionWithStatusCode: when the HTTP status is not 200; the
            upstream status code is passed through as ``status``.
    """
    target = clean_url(url)
    # Append as an extra parameter when a query string already exists.
    target += "&__a=1" if "?" in target else "?__a=1"

    status, body = http_get_url(target)
    if status != 200:
        raise ExceptionWithStatusCode(
            f'get url content failed, status code {status}', status=status
        )

    payload = json.loads(body)
    found = list(get_by_key(payload, ["video_url", "display_url"]))
    return {'urls': uniq(found)}
def get_by_key(data, keys):
    """Walk nested dicts/lists and yield one media value per media node.

    A dict whose ``__typename`` is ``GraphVideo`` or ``GraphImage`` yields
    the value of the first entry of *keys* it contains (so the order of
    *keys* sets the preference). Every dict value and list element is then
    searched recursively; any other kind of value yields nothing.

    Test links:
        2 videos: https://www.instagram.com/p/B3RAsfZAww6/
        2 images: https://www.instagram.com/p/B3Ob1HugPge/
    """
    if isinstance(data, list):
        for element in data:
            yield from get_by_key(element, keys)
        return

    if not isinstance(data, dict):
        return

    if data.get("__typename") in ("GraphVideo", "GraphImage"):
        for wanted in keys:
            if wanted in data:
                yield data[wanted]
                # Only the first matching key counts for this node.
                break

    # Keep descending even below a media node: children may be media too.
    for child in data.values():
        yield from get_by_key(child, keys)
# Demo call executed at module level: it runs whenever this file is executed
# or imported, and goes through http_get_url, i.e. it issues a real HTTP request.
print(get_image_urls_via_instagram_graphql('https://www.instagram.com/p/B3Ob1HugPge/'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment