Skip to content

Instantly share code, notes, and snippets.

@GlulkAlex
Created July 9, 2018 12:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GlulkAlex/9ef33d2e16f966105362c80bc86dd6ff to your computer and use it in GitHub Desktop.
Save GlulkAlex/9ef33d2e16f966105362c80bc86dd6ff to your computer and use it in GitHub Desktop.
Example of web scraping with Python and PyMongo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#utf_8 U8, UTF, utf8
"""read_from_json.py:
Example of web scraping in Python:
fetch a JSON listing over HTTP with 'requests' and
store each of its entries in MongoDB with PyMongo.
"""
__author__ = "GlukAlex"
import pymongo
import sys
import json
import requests
#import urllib.request
# AttributeError: 'module' object has no attribute 'request'
#import urllib
# ImportError: No module named 'urllib2'
#import urllib2
"""Note
The 'urllib2' module
has been split across several modules
in Python 3 named
'urllib.request' and
'urllib.error'.
The '2to3 tool' will
automatically adapt `imports`
when converting your sources to Python 3.
"""
"""Note
The 'urllib' module
has been split into parts and
renamed in Python 3 to
'urllib.request',
'urllib.parse', and
'urllib.error'.
The '2to3 tool' will
automatically adapt imports
when converting your sources to Python 3.
Also note that
the 'urllib.request.urlopen()' function
in Python 3 is equivalent to
'urllib2.urlopen()' and that
'urllib.urlopen()' has been removed.
"""
# Page fetched when the caller does not supply a URL: a locally served copy
# of "https://www.reddit.com/r/technology/.json".
_DEFAULT_PAGE_URL = (
    "http://localhost:8888/files/PyMongo/reddit_com_technology.json"
)
# Seconds to wait on the HTTP server; without a timeout 'requests' can
# block indefinitely.
_REQUEST_TIMEOUT = 30
# "Too Many Requests": the server asking the client to slow down.
_RATE_LIMITED = 429


def _story_documents(page_json) -> list:
    """Return the 'data' dict of every child in a listing-shaped payload.

    The expected shape is ``{"data": {"children": [{"data": {...}}, ...]}}``.
    Anything that does not match that shape yields no documents instead of
    raising, and children without a dict 'data' payload are skipped.
    """
    if not isinstance(page_json, dict):
        return []
    listing = page_json.get("data")
    if not isinstance(listing, dict):
        return []
    children = listing.get("children")
    if not isinstance(children, list):
        return []
    return [
        child["data"]
        for child in children
        if isinstance(child, dict) and isinstance(child.get("data"), dict)
    ]


def _fetch_story_documents(page_url: str):
    """Download 'page_url' and return its story documents.

    Returns None when the server signals rate limiting, either through the
    HTTP status 429 or through an ``"error": 429`` field in the JSON body.
    """
    page_Content = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
    # DEBUG
    print("""type(page_Content) is: {}""".format(type(page_Content)))
    if page_Content.status_code == _RATE_LIMITED:
        # The body of a 429 response is not guaranteed to be JSON,
        # so do not try to decode it.
        return None

    # Raises ValueError when the body is empty or is not valid JSON.
    page_JSON = page_Content.json()
    # DEBUG
    print("""type(page_JSON) is: {}""".format(type(page_JSON)))
    if isinstance(page_JSON, (str, dict)):
        print("""page_JSON is: {}""".format(page_JSON))

    # Match on the 'error' field alone: the payload may carry other keys
    # (such as a message) next to it.
    if isinstance(page_JSON, dict) and page_JSON.get("error") == _RATE_LIMITED:
        return None
    return _story_documents(page_JSON)


def get_N_Insert_Page_Content(url: str = "") -> None:
    """Fetch a JSON listing and reload the 'reddit.stories' collection.

    The page is downloaded and decoded first; only then is the 'stories'
    collection of the 'reddit' database on 'mongodb://localhost' dropped and
    refilled with one document per listing child. A failed download or an
    undecodable body therefore leaves the stored stories untouched.

    Args:
        url: Address of the JSON listing. When empty (the default), the
            locally served copy of the r/technology listing is used.

    Raises:
        ValueError: If the response body is not valid JSON.

    Network failures (including the request timeout) and MongoDB errors are
    not caught here; they propagate from 'requests' and 'pymongo'.
    """
    documents = _fetch_story_documents(url or _DEFAULT_PAGE_URL)

    # The context manager closes the client's connections on exit,
    # including when an insert fails part-way through.
    with pymongo.MongoClient('mongodb://localhost') as client:
        # handle to the 'stories' collection of the 'reddit' database
        stories = client.reddit.stories
        # clear / delete the existing collection entirely
        stories.drop()

        if documents is None:
            # Status 429 is not an error: the server is asking us to
            # stop sending requests for now, so nothing is inserted.
            print("""rate limited (429): nothing inserted into 'stories'""")
            return

        # populate 'stories' with page data
        print("""populateing 'stories' with page data ...""")
        for document in documents:
            stories.insert_one(document)
# Script entry point: when this file is executed directly (not imported),
# run one fetch-and-store pass with the function's default arguments.
if __name__ == "__main__":
    get_N_Insert_Page_Content()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment