Skip to content

Instantly share code, notes, and snippets.

@jmftrindade
Created March 28, 2019 19:54
Show Gist options
  • Save jmftrindade/8c01a4b7b6729813de0e019f60a5f631 to your computer and use it in GitHub Desktop.
Save jmftrindade/8c01a4b7b6729813de0e019f60a5f631 to your computer and use it in GitHub Desktop.
Twitter scraper

From the article at https://readypipe.com/blog/yang-buttigieg/.

from readypipe import requests, starting_task, subtask, schedule, save, get_attempts

# Twitter profile URL for each tracked candidate, keyed by candidate name.
# Every URL shares the same prefix, so only the account handle is spelled out
# per entry; the order of the pairs is the iteration order of URLS.
_TWITTER_BASE = 'https://twitter.com/'
_HANDLES = (
    ('Andrew Yang', 'andrewyang'),
    ('Bernie Sanders', 'BernieSanders'),
    ('Elizabeth Warren', 'ewarren'),
    ('Cory Booker', 'CoryBooker'),
    ('Kamala Harris', 'KamalaHarris'),
    ('Pete Buttigieg', 'PeteButtigieg'),
    ('Julian Castro', 'JulianCastro'),
    ('John Delaney', 'JohnDelaney'),
    ('Tulsi Gabbard', 'TulsiGabbard'),
    ('Kirsten Gillibrand', 'SenGillibrand'),
    ('John Hickenlooper', 'Hickenlooper'),
    ('Jay Inslee', 'JayInslee'),
    ('Amy Klobuchar', 'amyklobuchar'),
    ('Marianne Williamson', 'marwilliamson'),
    ('Stacey Abrams', 'staceyabrams'),
    ('Michael Bennet', 'SenatorBennet'),
    ('Joe Biden', 'JoeBiden'),
    ('Steve Bullock', 'GovernorBullock'),
    ('Andrew Cuomo', 'NYGovCuomo'),
    ('Bill de Blasio', 'BilldeBlasio'),
    ('Terry McAuliffe', 'TerryMcAuliffe'),
    ('Seth Moulton', 'sethmoulton'),
    ("Beto O'Rourke", 'betoorourke'),
    ('Eric Swalwell', 'RepSwalwell'),
    ('Mike Gravel', 'MikeGravel'),
)
URLS = {name: _TWITTER_BASE + handle for name, handle in _HANDLES}

@starting_task
def load_urls():
    """Schedule one ``scrape_candidate`` subtask per entry in ``URLS``.

    Each subtask is scheduled by name with a ``(candidate, candidate_url)``
    tuple as its arguments, matching the parameters of ``scrape_candidate``.
    """
    # Iterate name/URL pairs directly rather than looking each key up again.
    for candidate, candidate_url in URLS.items():
        schedule('scrape_candidate', (candidate, candidate_url))

@subtask
def scrape_candidate(candidate, candidate_url):
    """Read a candidate's follower count from their profile page and save it.

    Args:
        candidate: Candidate name, stored as-is in the saved record.
        candidate_url: Profile page URL to load; also stored in the record.

    The count is taken from the ``data-count`` attribute of the
    ``ProfileNav-value`` span inside the followers nav item, and is saved to
    ``'dem_candidates'`` both as the raw string and as an ``int``.

    If the expected markup is missing, the indexing / ``int()`` below raises
    (IndexError, KeyError or ValueError) and nothing is saved.
    NOTE(review): presumably the framework retries a subtask that raises and
    ``get_attempts()`` reports how many tries were made -- confirm.
    """
    attempt_limit = 5  # give up (without saving) once this many attempts were made
    if get_attempts() >= attempt_limit:
        return

    dom = requests.get_dom_from_content(candidate_url)

    # First locate the followers nav item, then the value span nested in it.
    follower_items = dom.xpath(
        '//*/li[@class="ProfileNav-item ProfileNav-item--followers"]')
    value_spans = follower_items[0].xpath(
        'descendant::span[@class="ProfileNav-value"]')
    raw_count = value_spans[0].attrib['data-count']

    record = {
        'candidate': candidate,
        'candidate_url': candidate_url,
        'followers_string': raw_count,
        'followers': int(raw_count),
    }
    save('dem_candidates', record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment