Created
January 26, 2019 18:30
-
-
Save rhanka/b119417b51e2cf1a1a36d1d5ecb5fa14 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# -*- coding: utf-8 -*- | |
import hashlib
import json
import re
import time

import dataiku
import numpy as np
import pandas as pd
import requests
from dataiku import pandasutils as pdu
from pandas.io.json import json_normalize
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# --- Search parameters ------------------------------------------------------
text_query = 'my_query'   # keywords sent in the search filter
category = '2'            # category id sent in the search filter
limit = 100               # value of the "limit" field of the request body
maxtries = 3              # maximum number of attempts made by the retry loop

# --- API access -------------------------------------------------------------
# A free API key can be found by inspecting, in Firefox, the requests made
# while browsing leboncoin.fr.
api_key = 'my_api_key'
url = 'https://api.leboncoin.fr/finder/search'

# JSON body of the search request.
data = {
    "limit": limit,
    "limit_alu": 3,
    "filters": {
        "category": {"id": category},
        "enums": {"ad_type": ["offer"]},
        "location": {},
        "keywords": {"text": text_query},
        "ranges": {},
    },
}

# HTTP headers sent with the request: a browser-like User-Agent, the API key,
# and the site's origin / referer.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'api_key': api_key,
    'origin': 'https://www.leboncoin.fr',
    'Referer': 'https://www.leboncoin.fr/voitures/offres/',
}
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# Query the search API, retrying failed attempts with exponential back-off.
# Reads `url`, `headers`, `data` and `maxtries` from the configuration cell.
#
# After this cell, `response` holds the successful (HTTP 200) reply; if every
# attempt failed a RuntimeError is raised instead.

# Seconds to wait for the server. Without an explicit timeout, requests can
# block indefinitely and the timeout handler below would never be reached.
request_timeout = 30

tries = 1
failed = True
response = None
status_code = None
while failed and tries <= maxtries:
    try:
        response = requests.post(url, headers=headers, json=data,
                                 timeout=request_timeout)
        status_code = response.status_code
    except requests.exceptions.Timeout:
        # Covers both connect and read timeouts.
        status_code = "timeout"
    if status_code == 200:
        failed = False
    else:
        tries += 1
        if tries <= maxtries:
            # Back off 3s, 9s, ... before the next attempt.
            time.sleep(3 ** (tries - 1))

if failed:
    # Stop with a clear message rather than letting a later cell fail on a
    # missing or unusable `response`.
    raise RuntimeError(
        "leboncoin search failed after {} attempt(s); last status: {}".format(
            maxtries, status_code))
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# Build one DataFrame row per entry of the 'ads' list in the JSON reply.
df = pd.DataFrame(json.loads(response.content)['ads'])
# Preview the first rows. head must be *called*: printing `df.head` without
# parentheses only shows the bound method's repr. The function-call form of
# print works on both Python 2 and Python 3.
print(df.head())
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# Replace the listed columns with their JSON text representation, so each
# cell holds a plain string instead of a nested Python object.
cols = ['attributes', 'images', 'location', 'options', 'owner']
for col in cols:
    # json.dumps already returns a string, so no extra str() is needed.
    df[col] = df[col].apply(json.dumps)
# Preview the first rows (head must be called, not merely referenced).
print(df.head())
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE | |
# Recipe output: store the DataFrame in the "leboncoin_scrap" Dataiku dataset.
# NOTE(review): per the Dataiku API, write_with_schema also replaces the
# dataset schema with the DataFrame's - confirm this is intended.
leboncoin_scrap = dataiku.Dataset("leboncoin_scrap")
leboncoin_scrap.write_with_schema(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment