Skip to content

Instantly share code, notes, and snippets.

View triposat's full-sized avatar

Satyam Tripathi triposat

View GitHub Profile
@triposat
triposat / amazon_asins_proxies.py
Created November 26, 2024 06:33
Scrape Amazon ASINs with Residential Proxies + Custom Headers
import asyncio
import os
import random
import time
from dataclasses import dataclass
from typing import List, Optional, Set
import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests
@triposat
triposat / amazon_asins.py
Created November 26, 2024 06:12
Scrape Amazon ASINs
import asyncio
import os
from curl_cffi import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random
class AsinScraper:
def __init__(self):
self.session = requests.Session()
@triposat
triposat / amazon_keywords_data.json
Created November 25, 2024 11:47
Bright Data Amazon Scraper API Data
[
{
"asin": "B09MRFNDZ6",
"url": "https://www.amazon.com/X-Box-Gaming-Console-Bundle-Controllers/dp/B09MRFNDZ6/ref=sr_1_1",
"name": "X-Box Series X Gaming Console Bundle - 1TB SSD Black X-Box Console with Two Wireless Controllers -Black and White -and ahaghug Authorized HDMI Cable X-Box Series X Gaming Console Bundle - 1TB SSD Black X-Box Console with Two Wireless Controllers -Black and White -and ahaghug Authorized HDMI Cable",
"sponsored": "false",
"initial_price": 604.49,
"final_price": 558,
"currency": "USD",
"sold": 300,
@triposat
triposat / hotel_analysis_dashboard.py
Created November 8, 2024 11:19
Google Maps Scrape: Hotel Analysis Dashboard - Crawlee for Python
import streamlit as st
import pandas as pd
import plotly.express as px
import json
import re
from pathlib import Path
# Setup dark theme dashboard
st.set_page_config(
page_title="Hotel Analysis Dashboard",
@triposat
triposat / google_maps_scraper_proxies.py
Last active December 23, 2024 17:06
Google Maps Scraper - Using Proxies - Crawlee for Python
import asyncio
from datetime import timedelta
from typing import Dict, Optional, Set
from crawlee.playwright_crawler import PlaywrightCrawler
from crawlee.proxy_configuration import ProxyConfiguration
from playwright.async_api import Page, ElementHandle
class GoogleMapsScraper:
"""
@triposat
triposat / google_maps_scraper.py
Last active December 23, 2024 17:04
Google Maps Scraper - Crawlee for Python
import asyncio
from datetime import timedelta
from typing import Dict, Optional, Set
from crawlee.playwright_crawler import PlaywrightCrawler
from playwright.async_api import Page, ElementHandle
class GoogleMapsScraper:
"""
Scraper for extracting business listing data from Google Maps.
@triposat
triposat / google_flights_scraper.py
Created October 28, 2024 04:34
Google Flights Scraper - Bright Data
from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_fixed
from dataclasses import dataclass
from typing import List, Optional
from datetime import datetime
import asyncio
import json
import os
@triposat
triposat / walmart_product_data.json
Created October 23, 2024 08:05
Scraped Walmart Product Data using Massive Proxies
[
{
"title": "14.1in Windows 11 Pro Laptop, 8GB DDR4, 512GB SSD Computer, Intel Celeron, 1920x1080, 1TB Expansion, Silver",
"product_url": "https://www.walmart.com/ip/Temlicolo-14-1-Laptop-8GB-RAM-PC-512GB-SSD-Intel-Celeron-N4020C-up-to-2-8GHz-Windows-11-Pro-Webcam-1TB-SSD-Expansion-Silver/1519228026?classType=VARIANT&selectedSellerId=101196098&from=/search",
"current_price": "Now$22789",
"previous_price": "$499.99",
"rating": "4.5",
"num_reviews": "220",
"shipping_info": "Free shipping, arrives in 2 days"
},
@triposat
triposat / scrape_walmart_massive_proxies.py
Created October 23, 2024 07:47
How to Scrape Walmart Data Using Massive Proxies
import asyncio
import random
import json
from playwright.async_api import async_playwright
async def scroll_and_extract(page):
previous_height = await page.evaluate("document.body.scrollHeight")
while True:
network_conditions = await page.evaluate(
@triposat
triposat / google_images_scraper.py
Created October 13, 2024 17:57
Google Images Scraper - ScrapingAnt
import asyncio
import json
import os
import shutil
from aiohttp import ClientSession, ClientTimeout
from urllib.parse import urlparse, urlencode
from playwright.async_api import async_playwright
# Function to extract the domain from a URL
def extract_domain(url):