Skip to content

Instantly share code, notes, and snippets.

@nix010
Last active February 26, 2019 09:49
Show Gist options
  • Save nix010/b8d658c00761098aecd8d1a09ea5e9d3 to your computer and use it in GitHub Desktop.
Save nix010/b8d658c00761098aecd8d1a09ea5e9d3 to your computer and use it in GitHub Desktop.
Crawl pictures from Pinterest by searching for a keyword | 26 Jan, 2018 (TESTED)
from bs4 import BeautifulSoup as BS
import requests
class BaseCrawler(object):
    """Small base class for API crawlers built on a ``requests`` session.

    Subclasses set ``api_url`` and override ``call_request``,
    ``parse_response_data`` and ``save_data_to_db``; ``crawl_now`` runs those
    three steps in that order.
    """

    # Endpoint to call; left as None here for subclasses to fill in.
    api_url = None

    # Headers sent with every request. The dict lives on the class and is
    # therefore shared by all instances: never mutate it in place, always go
    # through ``_merged_headers``.
    default_headers = {
        # Quality values are introduced with ';' (was 'en,q=0.9').
        'Accept-Language': 'en-US,en;q=0.9,vi;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36',
    }

    def __init__(self, email, password, user_id, **kwargs):
        """Create the HTTP session used by ``_get`` and ``_post``.

        ``email``, ``password``, ``user_id`` and any extra keyword arguments
        are accepted to keep the signature stable but are not used here.
        """
        self.r = requests.Session()

    def _merged_headers(self, headers=None):
        """Return a fresh dict: ``default_headers`` overlaid with ``headers``.

        Copying first keeps per-call headers from leaking into the shared
        class-level ``default_headers``.
        """
        merged = dict(self.default_headers)
        if headers:
            merged.update(headers)
        return merged

    def _get(self, url, params=None, headers=None, cookies=None):
        """Send a GET through the session and return its response.

        :param url: target URL.
        :param params: query-string parameters; ``None`` becomes ``{}``.
        :param headers: extra headers layered over ``default_headers`` for
            this call only.
        :param cookies: cookies for this call; ``None`` becomes ``{}``.
        """
        if params is None:
            params = {}
        if cookies is None:
            cookies = {}
        return self.r.get(
            url,
            params=params,
            headers=self._merged_headers(headers),
            cookies=cookies,
        )

    def _post(self, url, params=None, data=None, headers=None):
        """Send a POST through the session and return its response.

        :param url: target URL.
        :param params: query-string parameters (previously accepted but
            silently dropped; now forwarded to the session).
        :param data: request body.
        :param headers: extra headers layered over ``default_headers`` for
            this call only.
        """
        return self.r.post(
            url,
            params=params,
            data=data,
            headers=self._merged_headers(headers),
        )

    def save_data_to_db(self):
        """Hook for persisting parsed data; does nothing in the base class."""
        pass

    def crawl_now(self):
        """Run one crawl: request, parse the response, then save."""
        response = self.call_request()
        self.parse_response_data(response)
        self.save_data_to_db()

    def call_request(self):
        """Hook that performs the request; the base class returns ``None``."""
        pass

    def parse_response_data(self, response):
        """Hook that receives the result of ``call_request``; no-op here."""
        pass
import json
from core.crawlers.base_crawler import BaseCrawler # Just some helpers to call API
class PinterestCrawler(BaseCrawler):
    """Search Pinterest pins for a keyword and collect their image URLs.

    After ``crawl_now()`` (or ``parse_response_data``) the original-size image
    URLs of the returned pins are available in ``self.parsed_data``.
    """

    api_url = 'https://www.pinterest.com/resource/SearchResource/get/'

    # ``dict(...)`` builds a new mapping, so the base class headers are left
    # untouched while the two Pinterest-specific headers are added.
    default_headers = dict({
        'X-Requested-With': 'XMLHttpRequest',
        'X-Pinterest-AppState': 'active',
    }, **BaseCrawler.default_headers)

    # Bookmark token captured from a real search request in the browser and
    # used when the caller does not supply one.
    # NOTE(review): presumably a pagination cursor that may expire - confirm.
    DEFAULT_BOOKMARK = 'Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg=='

    def __init__(self, keyword, bookmarks=None):
        """Prepare the session and the search request parameters.

        :param keyword: search term sent as the ``query`` option and embedded
            in ``source_url``.
        :param bookmarks: optional list of bookmark tokens; when omitted the
            captured ``DEFAULT_BOOKMARK`` is used, as before.
        """
        self.r = requests.Session()
        self.keyword = keyword
        # Defined up front so the attribute exists before the first crawl.
        self.parsed_data = []
        if bookmarks is None:
            bookmarks = [self.DEFAULT_BOOKMARK]
        self.params = {
            'source_url': '/search/pins/?q=%s' % keyword,
            'rs': 'typed',
            # The endpoint takes its options as a JSON string in ``data``.
            'data': json.dumps({
                'options': {
                    'bookmarks': list(bookmarks),
                    'filters': "",
                    'query': "%s" % keyword,
                    'scope': "pins",
                },
                "context": {},
            }),
        }

    def parse_response_data(self, response):
        """Extract the original-size image URL of every returned pin.

        Stores the list of URLs in ``self.parsed_data``. Pins without an
        ``images.orig.url`` entry are skipped instead of aborting the parse.

        :param response: response object exposing ``json()``.
        :raises Exception: when ``resource_response`` carries an ``error``.
        """
        resp = response.json()['resource_response']
        error = resp.get('error')
        if error:
            # %-formatting copes with non-string error payloads, where the
            # former ``str + value`` concatenation raised TypeError.
            raise Exception('%s%s' % (response, error))

        urls = []
        # ``or []`` also covers an explicit ``"data": null``.
        for pic in resp.get('data') or []:
            original = (pic.get('images') or {}).get('orig') or {}
            url = original.get('url')
            if url:
                urls.append(url)
        # This is the result of the parse.
        self.parsed_data = urls

    def call_request(self):
        """GET the search endpoint with the prepared parameters."""
        return self._get(self.api_url, params=self.params)
''' Sample of the Pinterest request params
:bookmarks : a string you can get by capturing the request to "/resource/SearchResourceBase/get/"
in Chrome DevTools (find it in the XHR section when you enter a search on the web). The one I use
is hard-coded into the request because it works :v. (TESTED) 26 Jan 2018
{"options":{"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg=="],"filters":"","query":"harry potter","scope":"pins"},"context":{}}
'''
"request_identifier":"874307195690",
"resource_data_cache":[...],
"resource":{...},
"client_context":{...},
"resource_response":{
"data":[
{
"domain":"Uploaded by user",
"done_by_me":false,
"requires_advertiser_attribution":false,
"videos":null,
"tracking_params":"CwABAAAADDg3NDMwNzE5NTY5MAA",
"aggregated_pin_data":{
"did_it_data":{
"recommend_scores":[
{
"count":0,
"score":1
},
{
"count":0,
"score":0.5
},
{
"count":0,
"score":0
}
],
"rating":-1,
"user_count":3,
"tags":[
],
"images_count":0,
"recommended_count":3,
"details_count":3,
"type":"aggregateddiditdata"
},
"id":"4793619930918430080",
"aggregated_stats":{
"saves":38841,
"done":3
}
},
"image_signature":"c839d59c4cf008662871ed797ee84357",
"like_count":0,
"images":{
"736x":{
"url":"https://i.pinimg.com/736x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg",
"width":347,
"height":498
},
"474x":{
"url":"https://i.pinimg.com/474x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg",
"width":347,
"height":498
},
"orig":{
"url":"https://s-media-cache-ak0.pinimg.com/originals/c8/39/d5/c839d59c4cf008662871ed797ee84357.jpg",
"width":347,
"height":498
},
"136x136":{
"url":"https://i.pinimg.com/136x136/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg",
"width":136,
"height":136
},
"236x":{
"url":"https://i.pinimg.com/236x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg",
"width":236,
"height":338
}
},
"id":"324259241902931702",
"price_currency":"USD",
"is_promoted":false,
"description_html":"C",
"privacy":"public",
"grid_description":"C",
"comments":{
"bookmark":null,
"data":[
],
"uri":"/v3/pins/324259241902931702/comments/"
},
"access":[
],
"comment_count":0,
"board":{
"is_collaborative":false,
"layout":"default",
"name":"Creative",
"privacy":"public",
"url":"/phamthaominh197/creative/",
"owner":{
"id":"324259379329222479"
},
"followed_by_me":false,
"type":"board",
"id":"324259310610038895",
"image_thumbnail_url":"https://s-media-cache-ak0.pinimg.com/upload/324259310610038895_board_thumbnail_2017-12-23-05-56-05_51593_60.jpg"
},
"type":"pin",
"method":"uploaded",
"attribution":null,
"description":"C",
"price_value":0.0,
"additional_hide_reasons":[
],
"native_creator":null,
"is_playable":false,
"debug_info_html":null,
"ad_match_reason":0,
"link":null,
"has_required_attribution_provider":false,
"view_tags":[
],
"is_repin":true,
"pin360":null,
"liked_by_me":false,
"rich_summary":null,
"is_uploaded":true,
"pinner":{
"username":"phamthaominh197",
"explicitly_followed_by_me":false,
"image_xlarge_url":"https://i.pinimg.com/280x280_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg",
"full_name":"Minh Pham",
"image_small_url":"https://i.pinimg.com/30x30_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg",
"type":"user",
"id":"324259379329222479",
"image_large_url":"https://i.pinimg.com/140x140_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg"
},
"repin_count":0,
"created_at":"Tue, 31 May 2016 07:40:52 +0000",
"is_native":false,
"promoter":null,
"promoted_is_removable":false,
"buyable_product":null,
"dominant_color":"#232b2c",
"title":"",
"embed":null,
"is_quick_promotable":false,
"is_video":false,
"is_downstream_promotion":false
},
...
]
}
}
}
@pedroribeirodev
Copy link

Can you tell me if it works with django?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment