Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Created August 11, 2023 17:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lobstrio/9ca72017c01e27323007a9e9b37a2537 to your computer and use it in GitHub Desktop.
Save lobstrio/9ca72017c01e27323007a9e9b37a2537 to your computer and use it in GitHub Desktop.
🧙 Scrape all topics from the famous French GrowthHacking.fr forum — 'scraping' category only!
"""
GrowthHacking.fr Forum Scraper

This script scrapes data from the GrowthHacking.fr forum, specifically from
the "Scraping" category. It retrieves information about forum topics and
saves it to a CSV file.

Usage:
    1. Install the required library using the following command:
       $ pip install requests
    2. Run this script using the following command:
       $ python growthhackingfr_scraper.py

Note: Make sure you have Python installed on your system.

Author: Sasha Bouloudnine
Date: 11/08/2023

---
Required Library:
- requests: Used for making HTTP requests to the forum API.
"""
import requests
import time
import csv
import random
# Topic attributes copied from every forum topic into the export. The order of
# this list is also the column order of the CSV file.
FIELDNAMES = [
    # identification
    'id', 'title', 'fancy_title', 'slug',
    # post counters and preview image
    'posts_count', 'reply_count', 'highest_post_number', 'image_url',
    # dates and bump information
    'created_at', 'last_posted_at', 'bumped', 'bumped_at',
    'archetype',
    # state flags
    'unseen', 'pinned', 'unpinned', 'visible', 'closed', 'archived',
    'bookmarked', 'liked',
    # audience figures
    'views', 'like_count', 'has_summary',
    # miscellaneous
    'last_poster_username', 'category_id', 'pinned_globally', 'featured_link',
]
def scrap_growthhackingforum(start_page=1, timeout=30):
    """Collect the topics of the "Scraping" category of GrowthHacking.fr.

    Pages of ``/c/scraping/8/l/latest.json`` are requested one after another,
    starting at ``start_page``, until a page comes back with an empty
    ``topic_list.topics`` array. From each topic only the keys listed in
    ``FIELDNAMES`` are kept; other keys present in the raw payload (for
    example ``tags_descriptions`` and ``posters``) are not exported.

    Args:
        start_page: Number of the first page to request. Defaults to 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts, one per topic, keyed by ``FIELDNAMES``, in the order
        the forum served them. A key missing from a topic is stored as None.

    Raises:
        requests.HTTPError: If a page answers with a 4xx/5xx status.
        requests.RequestException: On any other network failure or timeout.
    """
    url = 'https://www.growthhacking.fr/c/scraping/8/l/latest.json'

    # Headers replayed from a browser request that was copied as cURL. They
    # never change between pages, so they are built once, outside the loop.
    # NOTE(review): the X-CSRF-Token / Discourse-Logged-In values come from a
    # captured browser session and are presumably not required for this
    # read-only GET -- confirm before relying on them.
    headers = {
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'Discourse-Present': 'true',
        'X-CSRF-Token': 'GE3UrIV9vAoQodpEWcjAnl-zDWKL7XfLD4NrTqvZBiU4XqqFAf2s9-a3e0HFTh9c4Vsu_G9B5uHTAJbQZ4ymTw',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'Discourse-Logged-In': 'true',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.growthhacking.fr/c/scraping/8',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua-platform': '"macOS"',
    }

    rows = []
    # NOTE(review): Discourse list endpoints appear to be 0-indexed (page=0
    # being the first page), so the historical default of 1 may skip the first
    # batch of topics -- confirm, and pass start_page=0 if so.
    page = start_page

    # One session for the whole crawl so the connection is reused between
    # pages and always closed at the end.
    with requests.Session() as session:
        while True:
            print('> accessing page %s' % page)
            response = session.get(
                url,
                params={'ascending': 'false', 'page': page},
                headers=headers,
                timeout=timeout,
            )
            # Explicit status check: an `assert` would vanish under `python -O`.
            response.raise_for_status()

            topics = response.json()['topic_list']['topics']
            if not topics:
                # An empty page means the listing is exhausted.
                break

            # .get() keeps the crawl alive when a topic lacks an optional key.
            rows.extend({key: topic.get(key) for key in FIELDNAMES} for topic in topics)
            page += 1

    return rows
def write_data(DATA):
    """Write the scraped topics to a timestamped CSV file.

    The file is created in the current working directory and is named
    ``results_growthhackingscraping_<timestamp>.csv``, where the timestamp is
    ``time.time()`` with its decimal point removed. The header row and the
    column order come from ``FIELDNAMES``.

    Args:
        DATA: Iterable of dicts keyed by ``FIELDNAMES``, one per topic.

    Raises:
        ValueError: If a row contains a key that is not in ``FIELDNAMES``
            (raised by ``csv.DictWriter``).
        OSError: If the output file cannot be created or written.
    """
    print('> writing data')
    timestamp = str(time.time()).replace('.', '')
    filename = 'results_growthhackingscraping_%s.csv' % timestamp
    # newline='' is required by the csv module (otherwise blank rows appear on
    # Windows); utf-8 keeps accented French titles intact on every platform.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(DATA)
    print('done/cool')
if __name__ == '__main__':
    # Script entry point: scrape every page first, then persist the result.
    DATA = scrap_growthhackingforum()
    if not DATA:
        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would let a header-only CSV be written.
        raise SystemExit('no topics scraped, nothing to write')
    write_data(DATA)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment