Skip to content

Instantly share code, notes, and snippets.

@sofianhw
Created October 8, 2015 07:30
Show Gist options
  • Save sofianhw/56f317c636f2892e68c9 to your computer and use it in GitHub Desktop.
Save sofianhw/56f317c636f2892e68c9 to your computer and use it in GitHub Desktop.
BS4ReadGzip
import sys
import re
import io
import gzip
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
def getLinks(pageUrl):
    """Fetch *pageUrl* and return its HTML parsed with BeautifulSoup.

    The request advertises gzip support through ``Accept-Encoding``. The
    body is decompressed only when the response's ``Content-Encoding``
    header says it is gzip-compressed, so servers that reply with plain
    HTML are handled too.

    Args:
        pageUrl: URL of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the (decompressed) body.

    Raises:
        urllib.error.URLError: if the request fails.
        OSError: if the body is declared gzip but is not valid gzip data.
    """
    request = Request(
        pageUrl,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
            "Accept-Encoding": "gzip",
        },
    )
    # The context manager closes the connection even if read() raises.
    with urlopen(request) as response:
        payload = response.read()
        contentEncoding = response.headers.get("Content-Encoding", "")

    # Accept-Encoding is only a hint; the server may still answer
    # uncompressed, in which case gunzipping would fail.
    if contentEncoding.strip().lower() in ("gzip", "x-gzip"):
        payload = gzip.decompress(payload)

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    bsObj = BeautifulSoup(payload, "html.parser")
    return bsObj
if __name__ == '__main__':
    # Script entry point: expects the page URL as the only argument.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: {} <url>".format(sys.argv[0]))
    getLinks(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment