GlulkAlex/read_from_json.py

## read_from_json.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#utf_8 U8, UTF, utf8

"""read_from_json.py:

Example of web scraping in Python:
download a Reddit JSON listing with `requests` and
store its stories in MongoDB with PyMongo.
"""

__author__ = "GlukAlex"

import pymongo
import sys
import json
import requests
#import urllib.request
# AttributeError: 'module' object has no attribute 'request'
#import urllib
# ImportError: No module named 'urllib2'
#import urllib2
"""Note
The 'urllib2' module
has been split across several modules
in Python 3 named
'urllib.request' and
'urllib.error'.
The '2to3 tool' will
automatically adapt `imports`
when converting your sources to Python 3.
"""
"""Note
The 'urllib' module
has been split into parts and
renamed in Python 3 to
'urllib.request',
'urllib.parse', and
'urllib.error'.
The '2to3 tool' will
automatically adapt imports
when converting your sources to Python 3.
Also note that
the 'urllib.request.urlopen()' function
in Python 3 is equivalent to
'urllib2.urlopen()' and that
'urllib.urlopen()' has been removed.
"""

def get_N_Insert_Page_Content(url: str = ""):
    """Fetch a Reddit-style JSON listing and store its stories in MongoDB.

    The page is downloaded and validated first.  Only when it contains a
    ``data.children`` list is the ``stories`` collection of the ``reddit``
    database dropped and refilled with the ``data`` object of every child.
    A failed download, a rate-limit answer or a malformed page therefore
    leaves the documents that are already stored untouched.

    Args:
        url: Address of the JSON listing, for example
            "https://www.reddit.com/r/technology/.json".  When empty, the
            locally served sample file is used.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: the page could not be
            downloaded (connection problem, timeout, ...).
        ValueError: the response body could not be decoded as JSON.
    """
    # sample listing served locally; used when the caller passes no url
    default_page = (
        "http://localhost:8888/files/PyMongo/reddit_com_technology.json"
    )
    reddit_page = url or default_page

    # a timeout keeps an unresponsive server from blocking the script forever
    response = requests.get(reddit_page, timeout=30)
    # DEBUG
    print("""type(page_Content) is: {}""".format(type(response)))

    # Receiving a status '429' is the server asking to slow down,
    # not a page worth storing.
    if response.status_code == 429:
        print("""request was rate limited (429), 'stories' left unchanged""")
        return

    # raises ValueError when the body is empty or is not valid JSON
    page_json = response.json()
    # DEBUG
    print("""type(page_JSON) is: {}""".format(type(page_json)))
    if isinstance(page_json, (str, dict)):
        print("""page_JSON is: {}""".format(page_json))

    # expected shape: {"data": {"children": [{"data": {...}}, ...]}}
    children = None
    if isinstance(page_json, dict) and 'error' not in page_json:
        listing = page_json.get('data')
        if isinstance(listing, dict):
            children = listing.get('children')
    if not isinstance(children, list):
        print("""no listing in the response, 'stories' left unchanged""")
        return

    # collect the documents before touching the database, so that a bad
    # entry cannot leave the collection half replaced
    documents = [
        child['data']
        for child in children
        if isinstance(child, dict) and isinstance(child.get('data'), dict)
    ]

    # connect to dataBase
    client = pymongo.MongoClient('mongodb://localhost')
    try:
        # handle to 'stories' collection of the 'reddit' database
        stories = client.reddit.stories

        # clear the existing collection only now that new data is in hand
        stories.drop()

        # populate 'stories' with page data
        print("""populateing 'stories' with page data ...""")
        for document in documents:
            stories.insert_one(document)
    finally:
        # release the connection even when an insert fails
        client.close()

# script entry point (not a unit test): when this file is run directly,
# call get_N_Insert_Page_Content() once with its default arguments
if __name__ == "__main__":
    # no arguments are passed, so the default `url` ("") is used
    get_N_Insert_Page_Content()
	#!/usr/bin/env python
	# -- coding: utf-8 --

	#utf_8 U8, UTF, utf8

	"""read_from_json.py:

	Example:
	of Python's
	web scraping
	with PyMongo
	"""

	__author__ = "GlukAlex"

	import pymongo
	import sys
	import json
	import requests
	#import urllib.request
	# AttributeError: 'module' object has no attribute 'request'
	#import urllib
	# ImportError: No module named 'urllib2'
	#import urllib2
	"""Note
	The 'urllib2' module
	has been split across several modules
	in Python 3 named
	'urllib.request' and
	'urllib.error'.
	The '2to3 tool' will
	automatically adapt `imports`
	when converting your sources to Python 3.
	"""
	"""Note
	The 'urllib' module
	has been split into parts and
	renamed in Python 3 to
	'urllib.request',
	'urllib.parse', and
	'urllib.error'.
	The '2to3 tool' will
	automatically adapt imports
	when converting your sources to Python 3.
	Also note that
	the 'urllib.request.urlopen()' function
	in Python 3 is equivalent to
	'urllib2.urlopen()' and that
	'urllib.urlopen()' has been removed.
	"""

	def get_N_Insert_Page_Content(url: str = ""):
		"""Fetch a Reddit-style JSON listing and store its stories in MongoDB.

		The page is downloaded and validated first.  Only when it contains a
		``data.children`` list is the ``stories`` collection of the ``reddit``
		database dropped and refilled with the ``data`` object of every child.
		A failed download, a rate-limit answer or a malformed page therefore
		leaves the documents that are already stored untouched.

		Args:
			url: Address of the JSON listing, for example
				"https://www.reddit.com/r/technology/.json".  When empty, the
				locally served sample file is used.

		Returns:
			None.

		Raises:
			requests.exceptions.RequestException: the page could not be
				downloaded (connection problem, timeout, ...).
			ValueError: the response body could not be decoded as JSON.
		"""
		# sample listing served locally; used when the caller passes no url
		default_page = (
			"http://localhost:8888/files/PyMongo/reddit_com_technology.json"
		)
		reddit_page = url or default_page

		# a timeout keeps an unresponsive server from blocking the script forever
		response = requests.get(reddit_page, timeout=30)
		# DEBUG
		print("""type(page_Content) is: {}""".format(type(response)))

		# Receiving a status '429' is the server asking to slow down,
		# not a page worth storing.
		if response.status_code == 429:
			print("""request was rate limited (429), 'stories' left unchanged""")
			return

		# raises ValueError when the body is empty or is not valid JSON
		page_json = response.json()
		# DEBUG
		print("""type(page_JSON) is: {}""".format(type(page_json)))
		if isinstance(page_json, (str, dict)):
			print("""page_JSON is: {}""".format(page_json))

		# expected shape: {"data": {"children": [{"data": {...}}, ...]}}
		children = None
		if isinstance(page_json, dict) and 'error' not in page_json:
			listing = page_json.get('data')
			if isinstance(listing, dict):
				children = listing.get('children')
		if not isinstance(children, list):
			print("""no listing in the response, 'stories' left unchanged""")
			return

		# collect the documents before touching the database, so that a bad
		# entry cannot leave the collection half replaced
		documents = [
			child['data']
			for child in children
			if isinstance(child, dict) and isinstance(child.get('data'), dict)
		]

		# connect to dataBase
		client = pymongo.MongoClient('mongodb://localhost')
		try:
			# handle to 'stories' collection of the 'reddit' database
			stories = client.reddit.stories

			# clear the existing collection only now that new data is in hand
			stories.drop()

			# populate 'stories' with page data
			print("""populateing 'stories' with page data ...""")
			for document in documents:
				stories.insert_one(document)
		finally:
			# release the connection even when an insert fails
			client.close()

	# script entry point (not a unit test): when this file is run directly,
	# call get_N_Insert_Page_Content() once with its default arguments
	if __name__ == "__main__":
		get_N_Insert_Page_Content()