Skip to content

Instantly share code, notes, and snippets.

@tracylemke
Created January 3, 2020 10:13
Show Gist options
  • Save tracylemke/9102e825d8bd99119f2c664a7051c9bc to your computer and use it in GitHub Desktop.
Save tracylemke/9102e825d8bd99119f2c664a7051c9bc to your computer and use it in GitHub Desktop.
Web Scraper in Python: scrapes a table
from lxml import html
from lxml.html.clean import Cleaner
import requests
def remove_duplicates(mylist) -> list:
    """Return the items of *mylist* with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved, because ``dict`` keys keep insertion order.

    Args:
        mylist: Any iterable of hashable items.

    Returns:
        A new list holding each distinct item once, in first-seen order.

    Raises:
        TypeError: If an item is not hashable.
    """
    return list(dict.fromkeys(mylist))
# HTML cleaner configured to strip JavaScript.
# NOTE(review): `cleaner` is never applied to the parsed tree in this script;
# it is kept so the module-level name stays available.
cleaner = Cleaner()
cleaner.javascript = True

# Scrape list of games
### -- list of sample data -- ###
# Alternative url/path pairs; uncomment one pair to scrape a different page.
# url = "http://localhost/test/data.php"
# path = '//tbody[@id="databody"]//a[@target="_parent"]/text()'
# url = "https://www.mmorpg.com/games-list"
# path = '//div[@class="iside"]//a/text()'
url = 'https://en.wikipedia.org/wiki/List_of_Xbox_games'
path = '//td//a[@class="mw-redirect"]/text()'

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from silently parsing an HTTP error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
data = page.text
tree = html.fromstring(data)

# Text of every link matched by `path`; print the count, then each title.
titles = tree.xpath(path)
print(len(titles))
for game_title in titles:
    print(f"'{game_title}'")

# Text of every link inside the cells of the table with id "softwarelist".
genres = tree.xpath('//table[@id="softwarelist"]//tr//td//a/text()')
# Deduplicate once and reuse the count in the summary line.
total_uniques = len(remove_duplicates(genres))
print("There are {} genres but {} are unique ".format(len(genres), total_uniques))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment