Skip to content

Instantly share code, notes, and snippets.

@south1907
Created June 19, 2022 07:35
Show Gist options
  • Save south1907/85b054334a96f122ad9d311678bfd22f to your computer and use it in GitHub Desktop.
Save south1907/85b054334a96f122ad9d311678bfd22f to your computer and use it in GitHub Desktop.
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl
import xlsxwriter
def get_row_data(link_detail):
    """Download a Product Hunt product page and scrape its summary fields.

    Args:
        link_detail: Absolute URL of the product page to fetch.

    Returns:
        A list of five strings, in this order: the text of the three stat
        cells in the page's info row (the calling script stores them in the
        'upvotes', 'launches' and 'followers' columns), the href of the
        website button, and the text of the last matching description
        element. A field that cannot be located in the page is returned
        as ''.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # NOTE(review): the cookie below embeds csrf_token and
    # _producthunt_session_production values captured from a browser session.
    # They should be loaded from configuration instead of living in source.
    headers = {
        'authority': 'www.producthunt.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'first_visit=1654929088; first_referer=; ajs_anonymous_id=%22628897fa-3abb-4142-bc97-5cd8498b34d0%22; _delighted_web={%2271AaKmxD4TpPsjYW%22:{%22_delighted_fst%22:{%22t%22:%221654929088226%22}}}; _ga=GA1.2.1194507260.1654929088; _gid=GA1.2.1865172956.1654929088; visitor_id=79f8abb7-0d16-4544-b8cd-cd17ba70011d; track_code=8bc55d00b2; g_state={"i_p":1654936304525,"i_l":1}; csrf_token=ZQwohUveJ5pQwUtACpEyeMIb8yrRNb0omt7ribwddN58IaRx%2Fuj1PwpOpcAFsLTiiY7JfpcXStLqGRfumURwBA%3D%3D; _producthunt_session_production=kj5xAcTy%2FmoPvSGXsd9UmPtrC%2FybCFW61K1tqUGjjKafJw8G9YheayzD3V8aK2iOPErZtQ9cpsWHo755EKcqTzcXObpTmRXRsiS0zNYTFLMzq4dEJUukzusX408M7E4AN8XceFf85%2FiHO7JlucuQ7cRCIwIAlWqjUOSCYId4qbVpAwFWsix%2FkQJcIzj1cr2z9LF6XGzpE68nfyv3%2BG9ockMl%2B8Mib9u6u7qIAVcIt9AAOuFlSDnnbTxXfD9AQvIhMmOLhE%2Fv9UDTT9bEq%2Bb4maqe3iOt71hj1FMAYzShlqq9h57vq46lfp9MTQGVg%2FRoPHaziz1QVEwWz%2BsaZnwhlMiF7qCFdJSYadt7ptFpGHPVJfYsmmgipw0v%2F%2ByPhOCDVhFsUS2yKzFRVu9D7rjqaL4%2FSNNBczF%2FDpTVBJIHrqcOcyt%2FWN4RuFThgIxCYAwgLay%2FtTxpYWfvOGbSKFFnV2WEPWoLJYZOE7OmGA0%3D--WNYtOdTtEikObi19--tmXOs9r71CxPF6Y%2BddojTQ%3D%3D; _producthunt_session_production=UVTrsQELl53GdfwBanr%2B9qVeZ6xSiS44TtBaYGR6ve8rxhlvOB9qUO58ZcrYGdopr7Mb5gLgxDLcxIbJkRBfrR1vVYGuPLJXWMMdshYRlpGWXYypyy8yiytU3oY7Fz6LsdgTiqOCOUqjkF6nSRpXZtswNPWbHo3GC3lIDG0eIqc5tMnXWQOQ5sPMr55%2BwejtvmL6EZXin%2B5zar62ZRmZLFvYQqt8k83dHCf2C15GCPD6ONwlJOuSw%2B5ZrsV3vZP3MkYRGdaNQIC4gp6W5ZBSj0SSRbXX7zPtppdYVsUHb%2FwgNj67SfSeRolCPl94Wa%2BOWHFQz%2BquIIChV17cHXxK1%2BttSOYL8JcVFdFhW59GKlorU7RVrA%3D%3D--dbyTmU4YCrNTGqff--lq0LMN%2BR8ZFFEG4bYazTrQ%3D%3D; csrf_token=61q0Wbq0bU0RIWWKlZ0EUcAsI3p6f%2BwQBVEWpDB4FSTydzitD4K%2F6EuuiwqavILLi7kZLjxdG%2Bp1lurDFSER%2Fg%3D%3D',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36'
    }

    # Without a timeout a stalled connection would block the whole scrape.
    response = requests.get(link_detail, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into blank fields.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    result = []

    # Stat cells live inside the first matching info row. A missing row is
    # treated like a row with an unexpected number of cells: three blanks.
    info_rows = soup.find_all('div', {'class': 'style_flex___KlcI style_direction-row__oinjH style_flex-row-gap-4__izJPT style_mt-4__uHhPT'})
    stat_cells = []
    if info_rows:
        stat_cells = info_rows[0].find_all('div', {'class': 'style_flex___KlcI style_direction-row__oinjH style_flex-row-gap-1__VY472'})
    if len(stat_cells) == 3:
        for cell in stat_cells:
            inner = cell.find('div')
            result.append(inner.getText() if inner is not None else '')
    else:
        result.extend(['', '', ''])

    # Link to the product's website; an anchor without href yields ''.
    website_link = soup.find('a', {'class': 'styles_reset__opz7w styles_button__zKntg styles_secondary__aa4sx'})
    result.append(website_link.get('href', '') if website_link else '')

    # Description: the last element carrying this class combination.
    descriptions = soup.find_all('div', {'class': 'style_color-dark-grey__aN5DV style_fontSize-16__DCrgA style_fontWeight-400__5p97M'})
    result.append(descriptions[-1].getText() if descriptions else '')

    return result
def get_page_by_cursor(cursor, slug="productivity"):
    """Fetch one page of a topic's posts from Product Hunt's GraphQL endpoint.

    Args:
        cursor: Pagination cursor sent as the query's ``after`` argument. The
            calling script passes '' for the first page and the previous
            page's ``endCursor`` afterwards.
        slug: Topic slug to list posts for. Defaults to "productivity".

    Returns:
        The decoded JSON response. The calling script reads the post list
        from ``['data']['topic']['posts']``, which holds ``edges`` and
        ``pageInfo`` (``endCursor``, ``hasNextPage``).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    url = "https://www.producthunt.com/frontend/graphql"

    # The query is split at fragment boundaries only for readability; the
    # concatenated pieces form the exact query text as one string.
    query = (
        "query TopicPage($slug:String!$cursor:String$query:String$subtopic:ID$order:String$topPostsVariant:TopPostsCardVariant!){topic(slug:$slug){id slug parent{id name slug __typename}...MetaTags ...TopicPageHeaderFragment ...TopicPagePostListFragment relatedAd(kind:\"feed\"){...AdFragment __typename}relatedTopics(limit:3){id ...RelatedTopicsSidebarCardFragment __typename}__typename}stories(first:3 order:TRENDING){edges{node{id ...StoriesSidebarCardFragment __typename}__typename}__typename}...TopPostsSidebarCardFragment}"
        "fragment TopPostsSidebarCardFragment on Query{postsTop(preferredVariant:$topPostsVariant){variant posts{id name slug tagline ...PostThumbnail __typename}__typename}__typename}"
        "fragment PostThumbnail on Post{id name thumbnailImageUuid ...PostStatusIcons __typename}"
        "fragment PostStatusIcons on Post{id name productState __typename}"
        "fragment StoriesSidebarCardFragment on AnthologiesStory{id slug title headerImageUuid minsToRead __typename}"
        "fragment RelatedTopicsSidebarCardFragment on Topic{id slug name imageUuid description __typename}"
        "fragment MetaTags on SEOInterface{id meta{canonicalUrl creator description image mobileAppUrl oembedUrl robots title type author authorUrl __typename}__typename}"
        "fragment AdFragment on AdChannel{id post{id slug name updatedAt commentsCount ...PostVoteButtonFragment __typename}ctaText dealText name tagline thumbnailUuid url __typename}"
        "fragment PostVoteButtonFragment on Post{id featuredAt updatedAt createdAt disabledWhenScheduled hasVoted ...on Votable{id votesCount __typename}__typename}"
        "fragment TopicPageHeaderFragment on Topic{id name description parent{id name slug __typename}...TopicFollowButton ...FacebookShareButtonFragment topPosts:posts(first:3 order:\"most-upvoted\"){edges{node{id name slug ...PostThumbnail __typename}__typename}__typename}__typename}"
        "fragment TopicFollowButton on Topic{id slug name isFollowed followersCount ...TopicImage __typename}"
        "fragment TopicImage on Topic{name imageUuid __typename}"
        "fragment FacebookShareButtonFragment on Shareable{id url __typename}"
        "fragment TopicPagePostListFragment on Topic{name slug posts(first:20 after:$cursor query:$query subtopic:$subtopic order:$order){edges{node{id ...PostItem ...TopicPageReviewRatingFragment __typename}__typename}pageInfo{endCursor hasNextPage __typename}__typename}__typename}"
        "fragment PostItem on Post{id commentsCount name shortenedUrl slug tagline updatedAt pricingType topics(first:1){edges{node{id name slug __typename}__typename}__typename}redirectToProduct{id slug __typename}...PostThumbnail ...PostVoteButton __typename}"
        "fragment PostVoteButton on Post{id featuredAt updatedAt createdAt disabledWhenScheduled hasVoted ...on Votable{id votesCount __typename}__typename}"
        "fragment TopicPageReviewRatingFragment on Post{id reviewsWithBodyCount product{id slug __typename}__typename}"
    )

    payload = json.dumps({
        "operationName": "TopicPage",
        "variables": {
            "slug": slug,
            "order": "most-upvoted",
            "cursor": cursor,
            "query": None,
            "topPostsVariant": "THIS_WEEK",
            "includeLayout": False
        },
        "query": query
    })

    # NOTE(review): the cookie below embeds csrf_token and
    # _producthunt_session_production values captured from a browser session.
    # They should be loaded from configuration instead of living in source.
    headers = {
        'authority': 'www.producthunt.com',
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'cookie': 'first_visit=1654929088; first_referer=; ajs_anonymous_id=%22628897fa-3abb-4142-bc97-5cd8498b34d0%22; _delighted_web={%2271AaKmxD4TpPsjYW%22:{%22_delighted_fst%22:{%22t%22:%221654929088226%22}}}; _ga=GA1.2.1194507260.1654929088; _gid=GA1.2.1865172956.1654929088; visitor_id=79f8abb7-0d16-4544-b8cd-cd17ba70011d; track_code=8bc55d00b2; g_state={"i_p":1654936304525,"i_l":1}; _gat=1; csrf_token=fD5%2BecIhhGYELtY2m4PPz0X6Jk7kLlBIYgrbwIwDoKVlE%2FKNdxdWw16hOLaUoklVDm8cGqIMp7ISzSenqVqkfw%3D%3D; _producthunt_session_production=CvQI14XphpKK46l3AB7ixRPz4CpCiM4NLPUW%2B2xKSnxK6jFYiB%2FTxlt%2BUdGa6wcsfouFAwJC2GJDomq1ljkmI0vA1FEVC9YzLvHhcvCP41fEcRwmjipl%2F22h6DdUMTWpWppyS7L8qU4mNJ8GNDbX5waAHB7JzHItkdjZeyNV4qAdwZgJKQa9WU0JXyJ2%2FhKBjy16ylp9xjwaatsTM7aCbae%2BF9ziWFUO3nWywoOOUVdY5HYfktorH0OqHTCsYIw1%2BK8KP50ZNh8TleykOO59J6QyRzISPwVO1KB22SfQY%2BGE6CZeqU106wWRBH9feAOWWy5fedaS%2BY%2FF0r9m458IUmai9DmNYIwnFun5HU0%3D--%2FAzoSXAwPi9f7aho--JivoTF0rZbF2cL84JxTs5w%3D%3D; _producthunt_session_production=UVTrsQELl53GdfwBanr%2B9qVeZ6xSiS44TtBaYGR6ve8rxhlvOB9qUO58ZcrYGdopr7Mb5gLgxDLcxIbJkRBfrR1vVYGuPLJXWMMdshYRlpGWXYypyy8yiytU3oY7Fz6LsdgTiqOCOUqjkF6nSRpXZtswNPWbHo3GC3lIDG0eIqc5tMnXWQOQ5sPMr55%2BwejtvmL6EZXin%2B5zar62ZRmZLFvYQqt8k83dHCf2C15GCPD6ONwlJOuSw%2B5ZrsV3vZP3MkYRGdaNQIC4gp6W5ZBSj0SSRbXX7zPtppdYVsUHb%2FwgNj67SfSeRolCPl94Wa%2BOWHFQz%2BquIIChV17cHXxK1%2BttSOYL8JcVFdFhW59GKlorU7RVrA%3D%3D--dbyTmU4YCrNTGqff--lq0LMN%2BR8ZFFEG4bYazTrQ%3D%3D; csrf_token=61q0Wbq0bU0RIWWKlZ0EUcAsI3p6f%2BwQBVEWpDB4FSTydzitD4K%2F6EuuiwqavILLi7kZLjxdG%2Bp1lurDFSER%2Fg%3D%3D',
        'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjE4NjQxMTMiLCJhcCI6IjU5NDMzNzgyMiIsImlkIjoiM2RlNGZiYTgxMTQwYjBlMyIsInRyIjoiMTdlYTQ3YmNhYzU2MTA0OWIxZWUwOTdmZGRiZDA4ZDciLCJ0aSI6MTY1NDkzODIyMTQyMn19',
        'origin': 'https://www.producthunt.com',
        # Point the referer at the topic page that is actually being listed.
        'referer': 'https://www.producthunt.com/topics/' + slug,
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'traceparent': '00-17ea47bcac561049b1ee097fddbd08d7-3de4fba81140b0e3-01',
        'tracestate': '1864113@nr=0-1-1864113-594337822-3de4fba81140b0e3----1654938221422',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }

    # Without a timeout a stalled connection would block the whole scrape.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    # Surface HTTP errors directly rather than as a confusing JSON/KeyError later.
    response.raise_for_status()
    return json.loads(response.text)
# Scrape every page of the topic listing, enrich each post with details from
# its product page, and save the collected rows to result.xlsx.

# Spreadsheet column labels, in the order each row is assembled below.
COLUMNS = ['name', 'slug', 'tagline', 'votesCount', 'link_origin', 'upvotes', 'launches', 'followers', 'website', 'bio']

data_file = []
current_cursor = ''
# The range is only a safety cap; the loop normally ends when the API
# reports that there is no further page.
for i in range(0, 10000):
    print('page: ' + str(i))
    data = get_page_by_cursor(current_cursor)
    topics = data['data']['topic']['posts']
    page_info = topics['pageInfo']
    print(page_info)

    # Process this page's posts BEFORE checking hasNextPage, so the posts on
    # the final page are collected too.
    for edge in topics['edges']:
        edge = edge['node']
        try:
            # Posts without an attached product have no detail page to scrape.
            if edge.get('product') is None:
                continue
            product_slug = edge['product']['slug']
            link_detail = 'https://www.producthunt.com/products/' + product_slug
            print(link_detail)
            row = [edge['name'], product_slug, edge['tagline'], edge['votesCount'], link_detail]
            row.extend(get_row_data(link_detail))
            data_file.append(row)
        except Exception as e:
            # Best effort: one bad post must not abort the run, but report
            # which post was skipped and why.
            print(edge)
            print('skipped by error: ' + repr(e))

    # Checkpoint after every page so a later failure keeps what was scraped.
    # After the last page this is also the final output file.
    df = pd.DataFrame(data_file, columns=COLUMNS)
    df.to_excel('result.xlsx', engine='xlsxwriter')

    if not page_info['hasNextPage']:
        print('break by Done page')
        break
    current_cursor = page_info['endCursor']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment