Skip to content

Instantly share code, notes, and snippets.

@weaming
Last active November 2, 2019 03:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save weaming/ccff0f885160516840a6b90640cda3ed to your computer and use it in GitHub Desktop.
Save weaming/ccff0f885160516840a6b90640cda3ed to your computer and use it in GitHub Desktop.
import re
import json
import requests
from json_api.errors import ExceptionWithStatusCode
# User-Agent header value sent with every request made by http_get_url.
# The string identifies the client as the MicroMessenger (WeChat) in-app
# browser running on an iPhone with iOS 13.1.2.
UA = (
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
'Mobile/15E148 MicroMessenger/7.0.8(0x17000820) NetType/WIFI Language/zh_CN'
)
def unesacpe(url):
    """Return *url* with each escaped slash (a backslash followed by "/") replaced by "/"."""
    escaped_slash = r"\/"
    # Splitting on the escaped form and re-joining with a bare slash replaces
    # every occurrence.
    return "/".join(url.split(escaped_slash))
# Compiled once at import time instead of on every grep_urls call.
_URL_REGEX = re.compile(
    r"((https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|])",
    re.I | re.S,
)


def grep_urls(html):
    """Return the sorted, de-duplicated http/https/ftp/file URLs found in *html*.

    Backslash escape sequences in the text are decoded first (via the
    ``unicode-escape`` codec) so that escaped characters inside URLs are
    matched in their plain form; escaped slashes are then normalised with
    ``unesacpe``.

    Args:
        html: Text to scan. It does not have to be HTML; any string works.

    Returns:
        A sorted list of unique URL strings (empty when nothing matches).
    """
    try:
        text = html.encode().decode('unicode-escape')
    except UnicodeError:
        # The codec rejects malformed escapes (e.g. a Windows path such as
        # "C:\Users" or a trailing backslash). Extraction is best-effort, so
        # scan the undecoded text rather than failing the whole call.
        text = html
    # findall returns one tuple per match because the pattern has two groups;
    # element 0 is the full URL.
    return sorted({unesacpe(groups[0]) for groups in _URL_REGEX.findall(text)})
def uniq(lst):
    """Return the distinct, non-blank entries of *lst* in ascending order.

    Falsy items (such as ``None`` or ``""``) and strings consisting only of
    whitespace are dropped.
    """
    distinct = set(lst)
    kept = [item for item in distinct if item and item.strip()]
    kept.sort()
    return kept
def clean_url(url: str) -> str:
    """Return a URL starting with "http" taken from *url*, or raise if none exists.

    If *url* already starts with ``"http"`` it is returned unchanged.
    Otherwise it is treated as free text: URLs are extracted with
    ``grep_urls`` and the first one that starts with ``"http"`` is returned.

    Args:
        url: A URL, or arbitrary text that contains one.

    Returns:
        The selected URL string.

    Raises:
        ExceptionWithStatusCode: With status 400 when *url* is not a string
            or no URL starting with "http" can be found in it.
    """
    if isinstance(url, str) and not url.startswith("http"):
        # grep_urls also matches ftp:// and file:// URLs and returns a sorted
        # list, so those sort ahead of any http URL. Pick the first http
        # candidate instead of blindly taking element 0.
        url = next(
            (found for found in grep_urls(url) if found.startswith("http")),
            None,
        )
    if not isinstance(url, str) or not url.startswith("http"):
        raise ExceptionWithStatusCode("invalid url", status=400)
    return url
def http_get_url(url, timeout=10):
    """GET *url* with the module's ``UA`` User-Agent header.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        A ``(status_code, text)`` tuple taken from the response.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires (raised by ``requests.get``).
    """
    resp = requests.get(url, headers={'User-Agent': UA}, timeout=timeout)
    return resp.status_code, resp.text
def get_image_urls_via_instagram_graphql(url):
    """Fetch the JSON form of an Instagram post and return its media URLs.

    The input is normalised with ``clean_url``, the query flag ``__a=1`` is
    added (presumably this asks Instagram for JSON instead of HTML; the
    response body is parsed with ``json.loads``), and every ``video_url`` /
    ``display_url`` found by ``get_by_key`` is collected.

    Args:
        url: A post URL, or text containing one.

    Returns:
        ``{'urls': [...]}`` where the list holds the unique media URLs in
        sorted order.

    Raises:
        ExceptionWithStatusCode: When the URL is invalid (from ``clean_url``)
            or the HTTP status is not 200 (the upstream status is passed on).
        json.JSONDecodeError: When the response body is not valid JSON.
    """
    url = clean_url(url)

    # The query flag must be placed before any "#fragment": text after "#"
    # is never sent to the server, and a "?" inside the fragment is not a
    # query string.
    base, hash_sep, fragment = url.partition("#")
    joiner = "&" if "?" in base else "?"
    url = f"{base}{joiner}__a=1{hash_sep}{fragment}"

    status, html = http_get_url(url)
    if status != 200:
        raise ExceptionWithStatusCode(
            f'get url content failed, status code {status}', status=status
        )

    # For each media node prefer the video URL and fall back to the image URL.
    media_urls = get_by_key(json.loads(html), ["video_url", "display_url"])
    return {'urls': uniq(media_urls)}
def get_by_key(data, keys):
    """Walk nested dicts/lists depth-first and yield media values.

    For every dict whose ``"__typename"`` is ``"GraphVideo"`` or
    ``"GraphImage"``, the value of the first entry of *keys* present in that
    dict is yielded (at most one value per dict). All dict values and list
    items are then searched recursively; any other kind of value yields
    nothing.

    Test links:
        2 videos: https://www.instagram.com/p/B3RAsfZAww6/
        2 images: https://www.instagram.com/p/B3Ob1HugPge/
    """
    if isinstance(data, dict):
        if data.get("__typename") in ["GraphVideo", "GraphImage"]:
            for name in keys:
                if name not in data:
                    continue
                # Only the first matching key counts for this node.
                yield data[name]
                break
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Scalars (str, int, None, ...) contain nothing to descend into.
        children = ()

    for child in children:
        yield from get_by_key(child, keys)
# Demo call using the "2 images" test link from get_by_key's docstring.
# NOTE(review): this statement is at module top level, so it runs whenever
# the file is executed or imported and performs a live HTTP request; consider
# guarding it with `if __name__ == "__main__":`.
print(get_image_urls_via_instagram_graphql('https://www.instagram.com/p/B3Ob1HugPge/'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment