# lobstrio/growthhackingfr_scraper.py

## growthhackingfr_scraper.py
"""
GrowthHacking.fr Forum Scraper

This script is used to scrape data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
It retrieves information about forum topics and saves it as CSV data.

Usage:
1. Install the required library using the following command:
   $ pip install requests

2. Run this script using the following command:
   $ python growthhackingfr_scraper.py

Note: Make sure you have Python installed on your system.

Author: Sasha Bouloudnine
Date: 11/08/2023

---

Required Library:
- requests: Used for making HTTP requests to the forum API.

"""

import requests
import time
import csv
import random

# Topic attributes copied from each forum topic into an output row.
# The same list is handed to csv.DictWriter, so its order is the column order.
FIELDNAMES = [
  'id', 'title', 'fancy_title', 'slug',
  'posts_count', 'reply_count', 'highest_post_number',
  'image_url',
  'created_at', 'last_posted_at', 'bumped', 'bumped_at',
  'archetype',
  'unseen', 'pinned', 'unpinned', 'visible', 'closed', 'archived',
  'bookmarked', 'liked',
  'views', 'like_count', 'has_summary',
  'last_poster_username', 'category_id',
  'pinned_globally', 'featured_link',
]

def scrap_growthhackingforum(timeout=30):
  """Collect every topic of the "Scraping" category of GrowthHacking.fr.

  Pages of ``/c/scraping/8/l/latest.json`` are requested one after another
  through a single HTTP session until a page returns an empty topic list.

  Args:
    timeout: Seconds to wait for each HTTP response before ``requests``
      gives up (without it a stalled connection would hang forever).

  Returns:
    A list with one dict per topic, each holding exactly the keys listed in
    ``FIELDNAMES``. A key the forum omits for a topic is stored as ``None``
    instead of aborting the whole run.

  Raises:
    RuntimeError: If the forum answers with a status code other than 200.
    KeyError: If the JSON payload has no ``topic_list`` / ``topics`` entry.
  """
  url = 'https://www.growthhacking.fr/c/scraping/8/l/latest.json'

  # Header set replicating a browser XHR (Chrome 114 on macOS per the
  # User-Agent). Built once: nothing in it depends on the page number.
  # NOTE(review): the X-CSRF-Token is a captured, hard-coded value and is
  # probably stale -- confirm whether the endpoint needs it at all.
  headers = {
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'Discourse-Present': 'true',
    'X-CSRF-Token': 'GE3UrIV9vAoQodpEWcjAnl-zDWKL7XfLD4NrTqvZBiU4XqqFAf2s9-a3e0HFTh9c4Vsu_G9B5uHTAJbQZ4ymTw',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Discourse-Logged-In': 'true',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': 'https://www.growthhacking.fr/c/scraping/8',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-platform': '"macOS"',
  }

  data = []

  # NOTE(review): Discourse topic lists are paged from 0 (the first response
  # advertises page=1 as the next page), so starting at 1 skipped the newest
  # topics -- confirm against the live forum.
  page = 0

  # The session is actually used (connection reuse) and closed on exit.
  with requests.Session() as session:
    while True:
      print('> accessing page %s' % page)

      response = session.get(
        url,
        params={'ascending': 'false', 'page': page},
        headers=headers,
        timeout=timeout,
      )
      # Explicit check rather than `assert`, which vanishes under `python -O`.
      if response.status_code != 200:
        raise RuntimeError(
          'unexpected HTTP status %s for %s' % (response.status_code, response.url)
        )

      topics = response.json()['topic_list']['topics']
      if not topics:
        # An empty page marks the end of the category.
        break

      # Keep only the whitelisted fields; .get() tolerates optional ones.
      data.extend({k: t.get(k) for k in FIELDNAMES} for t in topics)
      page += 1

  return data

def write_data(DATA):
  """Write the scraped topics to a timestamped CSV file.

  The file is created in the current working directory and is named
  ``results_growthhackingscraping_<timestamp>.csv``, where the timestamp is
  ``time.time()`` with the decimal point removed.

  Args:
    DATA: Iterable of dicts whose keys are the names in ``FIELDNAMES``
      (``csv.DictWriter`` raises ``ValueError`` on any other key).
  """
  print('> writing data')
  TIMESTAMP = str(time.time()).replace('.','')
  # newline='' is required by the csv module (otherwise Windows gets blank
  # rows); UTF-8 keeps accented titles intact whatever the locale default is.
  with open('results_growthhackingscraping_%s.csv' % TIMESTAMP, 'w',
            newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(DATA)

  print('done/cool')

# Script entry point: scrape the category, then dump the rows to CSV.
if __name__ == '__main__':
  DATA = scrap_growthhackingforum()
  # Explicit check instead of `assert`, which is skipped under `python -O`;
  # SystemExit with a message still exits with a non-zero status.
  if not DATA:
    raise SystemExit('no topics were scraped; nothing to write')
  write_data(DATA)
	"""
	GrowthHacking.fr Forum Scraper

	This script is used to scrape data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
	It retrieves information about forum topics and saves it as CSV data.

	Usage:
	1. Install the required library using the following command:
	$ pip install requests

	2. Run this script using the following command:
	$ python growthhackingfr_scraper.py

	Note: Make sure you have Python installed on your system.

	Author: Sasha Bouloudnine
	Date: 11/08/2023

	---

	Required Library:
	- requests: Used for making HTTP requests to the forum API.

	"""

	import requests
	import time
	import csv
	import random

	# Topic attributes copied from each forum topic into an output row.
	# The same list is handed to csv.DictWriter, so its order is the column order.
	FIELDNAMES = [
		'id', 'title', 'fancy_title', 'slug',
		'posts_count', 'reply_count', 'highest_post_number',
		'image_url',
		'created_at', 'last_posted_at', 'bumped', 'bumped_at',
		'archetype',
		'unseen', 'pinned', 'unpinned', 'visible', 'closed', 'archived',
		'bookmarked', 'liked',
		'views', 'like_count', 'has_summary',
		'last_poster_username', 'category_id',
		'pinned_globally', 'featured_link',
	]

	def scrap_growthhackingforum(timeout=30):
		"""Collect every topic of the "Scraping" category of GrowthHacking.fr.

		Pages of ``/c/scraping/8/l/latest.json`` are requested one after another
		through a single HTTP session until a page returns an empty topic list.

		Args:
			timeout: Seconds to wait for each HTTP response before ``requests``
				gives up (without it a stalled connection would hang forever).

		Returns:
			A list with one dict per topic, each holding exactly the keys listed
			in ``FIELDNAMES``. A key the forum omits for a topic is stored as
			``None`` instead of aborting the whole run.

		Raises:
			RuntimeError: If the forum answers with a status code other than 200.
			KeyError: If the JSON payload has no ``topic_list`` / ``topics`` entry.
		"""
		# NOTE(review): this definition repeats the one earlier in the file; its
		# block indentation had been flattened and is restored here.
		url = 'https://www.growthhacking.fr/c/scraping/8/l/latest.json'

		# Header set replicating a browser XHR (Chrome 114 on macOS per the
		# User-Agent). Built once: nothing in it depends on the page number.
		# The Accept value had lost its `*/*` wildcard and is restored.
		# NOTE(review): the X-CSRF-Token is a captured, hard-coded value and is
		# probably stale -- confirm whether the endpoint needs it at all.
		headers = {
			'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
			'Discourse-Present': 'true',
			'X-CSRF-Token': 'GE3UrIV9vAoQodpEWcjAnl-zDWKL7XfLD4NrTqvZBiU4XqqFAf2s9-a3e0HFTh9c4Vsu_G9B5uHTAJbQZ4ymTw',
			'sec-ch-ua-mobile': '?0',
			'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
			'Discourse-Logged-In': 'true',
			'Accept': 'application/json, text/javascript, */*; q=0.01',
			'Referer': 'https://www.growthhacking.fr/c/scraping/8',
			'X-Requested-With': 'XMLHttpRequest',
			'sec-ch-ua-platform': '"macOS"',
		}

		data = []

		# NOTE(review): Discourse topic lists are paged from 0 (the first response
		# advertises page=1 as the next page), so starting at 1 skipped the newest
		# topics -- confirm against the live forum.
		page = 0

		# The session is actually used (connection reuse) and closed on exit.
		with requests.Session() as session:
			while True:
				print('> accessing page %s' % page)

				response = session.get(
					url,
					params={'ascending': 'false', 'page': page},
					headers=headers,
					timeout=timeout,
				)
				# Explicit check rather than `assert`, which vanishes under `python -O`.
				if response.status_code != 200:
					raise RuntimeError(
						'unexpected HTTP status %s for %s' % (response.status_code, response.url)
					)

				topics = response.json()['topic_list']['topics']
				if not topics:
					# An empty page marks the end of the category.
					break

				# Keep only the whitelisted fields; .get() tolerates optional ones.
				data.extend({k: t.get(k) for k in FIELDNAMES} for t in topics)
				page += 1

		return data

	def write_data(DATA):
		"""Write the scraped topics to a timestamped CSV file.

		The file is created in the current working directory and is named
		``results_growthhackingscraping_<timestamp>.csv``, where the timestamp is
		``time.time()`` with the decimal point removed.

		Args:
			DATA: Iterable of dicts whose keys are the names in ``FIELDNAMES``
				(``csv.DictWriter`` raises ``ValueError`` on any other key).
		"""
		# NOTE(review): repeats the definition earlier in the file; the flattened
		# block indentation is restored here.
		print('> writing data')
		TIMESTAMP = str(time.time()).replace('.','')
		# newline='' is required by the csv module (otherwise Windows gets blank
		# rows); UTF-8 keeps accented titles intact whatever the locale default is.
		with open('results_growthhackingscraping_%s.csv' % TIMESTAMP, 'w',
				newline='', encoding='utf-8') as f:
			writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
			writer.writeheader()
			writer.writerows(DATA)

		print('done/cool')

	# Script entry point: scrape the category, then dump the rows to CSV.
	# NOTE(review): repeats the entry point earlier in the file; the flattened
	# block indentation is restored here.
	if __name__ == '__main__':
		DATA = scrap_growthhackingforum()
		# Explicit check instead of `assert`, which is skipped under `python -O`;
		# SystemExit with a message still exits with a non-zero status.
		if not DATA:
			raise SystemExit('no topics were scraped; nothing to write')
		write_data(DATA)