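# Scrape the Techstars portfolio widget for company websites, then check
# each site for a Google Play Store link and write the pairs to info.csv.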
import csv
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from requests.exceptions import (
    ConnectionError, ConnectTimeout, HTTPError, InvalidURL,
    ReadTimeout, Timeout, TooManyRedirects,
)
def get_html_array(js_url):
    """Download the Techstars widget JS and extract its `var html = [...]` array."""
    response = requests.get(js_url)
    # Slice out the bracketed array literal: start at the '[' and stop
    # just before the trailing `.join(`.
    start = response.text.find("var html = [") + len("var html = [") - 1
    end = response.text.find(".join(")
    raw_js_array = response.text[start:end]
    # A JS array of double-quoted strings is also valid JSON.
    return json.loads(raw_js_array)
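# Shape of the payload the slicing above assumes (hypothetical example;
# the live portfolio-statistics.js may differ in detail):
#
#   var html = ["<div>", "<a href=\"https://example.com\">Acme</a>", "</div>"].join("");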
def url_get(var):
    """Collect every non-Crunchbase href from the rendered widget HTML."""
    soup = BeautifulSoup(var, 'html.parser')
    list_urls = []
    for url in soup.find_all("a"):
        # Anchors without an href would otherwise make the `in` test
        # raise TypeError, so default to an empty string and skip blanks.
        href = url.get('href', '')
        if href and "www.crunchbase.com" not in href:
            list_urls.append(href)
    return list_urls
html_array = get_html_array("https://connect.techstars.com/widgets/portfolio-statistics.js")
html_final = "\n".join(html_array)
list_of_url = url_get(html_final)
def app_get(comp_url):
    """Fetch a company website and return its first Google Play link, or 'NULL'."""
    # Browser-like headers; the cookie was copied from a logged-in browser
    # session, so it will expire and should be replaced before reuse.
    headers = {
        'authority': 'www.crunchbase.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'dnt': '1',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cookie': '__cfduid=d499acc665a0982639cca9a26bdf370161557839726; _pxhd=52dfd28c320c0e311f69e0cb66953de8b765de55690c734ae52e52a1e6de5c06:56caf5b1-764a-11e9-8c30-3322ba1ff06d; cid=rBsWdlzav25iOAAkC8GtAg==; _ga=GA1.2.552015480.1557839702; _fbp=fb.1.1557839702219.416251075; __qca=P0-1160423674-1557839702197; __zlcmid=sIiEfBGcEd0oVD; fs_uid=rs.fullstory.com`BA8KZ`5353594564706304:5992337998938112; _gid=GA1.2.211578024.1558344556; _hp2_ses_props.973801186=%7B%22ts%22%3A1558351433747%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2F%22%7D; _mkto_trk=id:976-JJA-800&token:_mch-crunchbase.com-1558351439685-49508; remember_user_token=W1syNTM0MTY1XSwiWkZ3QXJHVjlUTV9HNHlubXNyN3AiLCIxNTU4MzUxODY0LjA3NTk0Il0%3D--fdcd15d5665029f749f43e973e421cdffe582a78; XSRF-TOKEN=zd7iIY3d6hgMqNBLtWPTnu2szyxH9QpgEi9sDwZiKs94O6%2B7K1aq3w1VpnvyLsCHvpLGJMudJqbj1llsv24N4w%3D%3D; _site_session=98f381c251275771207a87ccefc1f803; _gat_UA-60854465-1=1; _pxff_tm=1; _px3=649f2e4854a0958d3d3ff7fbcff4afc68eee70afa8853da5f4122faa108cfa49:8uLj9mg0hxM5QN09s3XkOZBD3cb8XrbNN2d6wCkdXN75YLqa5IgqzJwLwHHhrNs24Awvc6AtAQ1uDcGEKEzs/w==:1000:m2BFWfCzyT7cTEjySs+37sJcQlGalfOBNfqwbi8NF/ZyZl1y3kN13BOwqRfO5agy8uwKgRXvU0llAhkuWJ6Yhpmn0hH/MisEA6/j3ZmSsot2onYwnGui+4k2c0B/LxelB5fYcjb4AsdkSf0Ehd16McaOuso01PkUH+oWpsCz524=; _hp2_props.973801186=%7B%22Logged%20In%22%3Atrue%2C%22Pro%22%3Afalse%2C%22apptopia-lite%22%3Atrue%2C%22similarweb%22%3Atrue%2C%22owler%22%3Atrue%7D; _pendo_visitorId.c2d5ec20-6f43-454d-5214-f0bb69852048=c107a956-df02-4e2e-8467-f58a854df062; _pendo_meta.c2d5ec20-6f43-454d-5214-f0bb69852048=539547481; _hp2_id.973801186=%7B%22userId%22%3A%227667545879524903%22%2C%22pageviewId%22%3A%225492701109201534%22%2C%22sessionId%22%3A%226247381939942526%22%2C%22identity%22%3A%22john.andrews%40mailinnator.com%22%2C%22trackerVersion%22%3A%224.0%22%2C%22identityField%22%3Anull%2C%22isIdentified%22%3A1%7D',
    }
    try:
        response = requests.get(comp_url, headers=headers, timeout=2)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.select("a"):
            r = link.get("href", "")
            if "play.google.com" in r:
                return r
        return 'NULL'
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout,
            InvalidURL, ConnectionError, TooManyRedirects):
        # Treat any request failure the same as "no link found".
        return "NULL"
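# Note: 'authority' above is an HTTP/2-style pseudo-header name; requests
# speaks HTTP/1.1 and sends it as a literal "authority:" header, which
# servers generally ignore.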
def check(url):
    """Prefix schemeless URLs with http:// so requests can fetch them."""
    if url and not url.startswith(("http://", "https://")):
        return "http://" + url
    return url
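# Expected behaviour of check() (illustrative):
#   check("example.com")          -> "http://example.com"
#   check("https://example.com")  -> "https://example.com"
#   check("")                     -> ""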
def remove_values_from_list(the_list, val):
    """Remove every occurrence of `val` from `the_list` in place."""
    while val in the_list:
        the_list.remove(val)

list_of_url_up = list(map(check, list_of_url))
remove_values_from_list(list_of_url_up, None)
# Fetch all sites concurrently; executor.map preserves input order, so
# each Play Store result lines up with its source URL.
with ThreadPoolExecutor(max_workers=10) as executor:
    list_of_playstore_links = list(executor.map(app_get, list_of_url_up))

list_of_playstore_links.insert(0, "Playstore Link")
list_of_url_up.insert(0, "Company Website")

# Write paired rows directly; building a dict from the two lists would
# silently drop companies that share a website URL.
with open('info.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for website, playstore in zip(list_of_url_up, list_of_playstore_links):
        writer.writerow([website, playstore])
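# Example info.csv output (illustrative rows; real contents depend on the
# live widget data):
#
#   Company Website,Playstore Link
#   http://example.com,https://play.google.com/store/apps/details?id=com.example
#   http://another.example,NULL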