Skip to content

Instantly share code, notes, and snippets.

Created July 9, 2018 12:12
Show Gist options
  • Save GlulkAlex/9ef33d2e16f966105362c80bc86dd6ff to your computer and use it in GitHub Desktop.
Save GlulkAlex/9ef33d2e16f966105362c80bc86dd6ff to your computer and use it in GitHub Desktop.
Example of web scraping with Python and PyMongo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#utf_8 U8, UTF, utf8
# Example of Python's web scraping with PyMongo.
__author__ = "GlukAlex"
import pymongo
import sys
import json
import requests
#import urllib.request
# AttributeError: 'module' object has no attribute 'request'
#import urllib
# ImportError: No module named 'urllib2'
#import urllib2
# The 'urllib2' module
# has been split across several modules
# in Python 3 named
# 'urllib.request' and
# 'urllib.error'.
# The '2to3 tool' will
# automatically adapt `imports`
# when converting your sources to Python 3.
# The 'urllib' module
# has been split into parts and
# renamed in Python 3 to
# 'urllib.request',
# 'urllib.parse', and
# 'urllib.error'.
# The '2to3 tool' will
# automatically adapt `imports`
# when converting your sources to Python 3.
# Also note that
# the 'urllib.request.urlopen()' function
# in Python 3 is equivalent to
# 'urllib2.urlopen()' and that
# 'urllib.urlopen()' has been removed.
def get_N_Insert_Page_Content(url: str = ""):
# Purpose (as far as the visible statements show): obtain a response object,
# decode it with '.json()', print diagnostics about it and, unless the
# decoded payload equals {'error': 429}, iterate over 'content_Topics'.
# 'url' defaults to an empty string.
# NOTE(review): this listing has lost its indentation and several statements
# (unclosed parentheses, empty 'if' / 'for' bodies); every gap is marked
# below. Restore the missing code from the original before running.
# connect to dataBase
client = pymongo.MongoClient(
# default port from MongoDB config files
# for server to listen to
# NOTE(review): the call's arguments and the closing parenthesis are
# missing here - presumably a host / port for a local server; TODO confirm.
connection = client
# attach to 'reddit' database
db = connection.reddit
# handle to 'stories' collection
stories = (
# NOTE(review): the right-hand side is missing - presumably the 'stories'
# collection of 'db'; TODO confirm.
# clear / delete entirely the existing collection
# NOTE(review): the statement that would clear the collection is not visible.
# JSON Response Content
# There’s also
# a builtin JSON decoder,
# in case you’re dealing with JSON data:
#>>> import requests
#>>> r = requests.get('')
#>>> r.json()
# In case
# the JSON `decoding` fails,
# 'r.json' raises an exception.
# For example,
# if the `response` gets a '204' (No Content), or
# if the `response` contains `invalid` JSON,
# attempting 'r.json' raises
#ValueError: No JSON object could be decoded.
# get specified web page
reddit_page = (
# NOTE(review): the expression that fetches the page is missing here.
# The parameter list below looks like that of 'urllib.request.urlopen'
# (kept for reference) - TODO confirm which call was actually used.
# url,
# data=None,
# [timeout, ]*,
# cafile=None,
# capath=None,
# cadefault=False,
# context=None)
# Open the URL url,
# which can be
# either a string or
# a Request object.
page_Content = (
# NOTE(review): the right-hand side is missing here; the code below only
# requires that 'page_Content' has a '.json()' method.
# For 'http' and 'https' `urls`,
# this function returns
# a 'http.client.HTTPResponse' object
# which has
# the following 'HTTPResponse Objects' methods.
# AttributeError: 'bytes' object has no attribute 'read'
print("""type(page_Content) is: {}""".format(type(page_Content)) )
# decode the response body as JSON
page_JSON = page_Content.json()
print("""type(page_JSON) is: {}""".format(type(page_JSON)) )
# exact-type check: matches only 'str' or 'dict' themselves, not subclasses
if type(page_JSON) == str or type(page_JSON) == dict:
#print("""page_JSON is: {0['data']:50}""".format(page_JSON) )
# the 'width' keyword is not referenced by the '{}' template,
# so it has no effect on the printed text
print("""page_JSON is: {}""".format(page_JSON, width = 50) )
# NOTE(review): the parameter list below looks like that of 'json.load'
# (kept for reference) - TODO confirm.
# fp,
# cls=None,
# object_hook=None,
# parse_float=None,
# parse_int=None,
# parse_constant=None,
# object_pairs_hook=None,
# **kw)
# Deserialize 'fp'
# (a '.read()' - supporting file-like object
# containing a JSON document) to
# a Python object
# using this conversion table.
#page_Content_Parsed = (
# An 'HTTPResponse' instance
# wraps the `HTTP response` from the `server`.
# It provides
# access to
# the `request headers` and
# the `entity body`.
# The response is
# an `iterable` object and
# can be used in a 'with' statement.
# HTTPResponse.read([amt]):
# Reads and returns
# the `response body`, or
# up to the next 'amt' bytes.
# Receiving
# a status '429' is
# not an error,
# it is the other server "kindly" asking you to
# please stop spamming requests.
# proceed only when the decoded payload is not exactly {'error': 429}
if page_JSON != {'error': 429}:
content_Topics = (
# NOTE(review): the right-hand side is missing here; the commented-out loop
# below suggests the list under ["data"]["children"] - TODO confirm.
# populate 'stories' with page data
# '.format()' is called without arguments on a template with no
# placeholders, so the message is printed unchanged
print("""populateing 'stories' with page data ...""".format())
# iterate over array of objects
#for item in page_Content_Parsed["data"]["children"]:
for item in content_Topics:
# side effect
# Insert a single document.
# falsy items are skipped
if item:
# NOTE(review): the body of this 'if' (the insert into 'stories' announced by
# the comment above) is missing from this listing.
# unit test
# runs only when the file is executed as a script, not when it is imported
if __name__ == "__main__":
# OK ?
# NOTE(review): the guarded body is missing from this listing - presumably a
# call to get_N_Insert_Page_Content(...); TODO confirm against the original.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment