Zylvian/fetching.py

## fetching.py
import itertools
import string

import requests
import logging as log
from constants import Constants
"""
S = requests.Session()

PARAMS = {
    "action": "query",
    "format": "json",
    "generator": "allpages",
    "gtitles":"Luffy",
    "list": "allimages"
}

url = 'https://onepiece.fandom.com/api.php'

print(requests.get(url=url, params=PARAMS).content)
"""

class Fetcher:

    def __init__(self, wiki_name):

        self.constants = Constants()

        wiki_site = 'https://{wiki_name}.fandom.com'.format(wiki_name=wiki_name)

        self._querystartlink =  wiki_site + '/api/v1/Search/List?query='
        self._queryendlink = '&limit=1&minArticleQuality=10&batch=1&namespaces=0%2C14'

        self._imagestartlink = wiki_site + '/api.php?format=json&action=imageserving&wisId='
        self._summarystartlink = wiki_site + "/api/v1/Articles/AsSimpleJson?id="


        #self._endlink = self._startlink+'query&'

        # '&prop=info&inprop=url&generator=allpages&gapfromSS='

    def get_wiki_pages(self, names):
        pages = []

        for name in names:
            try:
                pages.append(self.__fetch_page(name))
            except KeyError:
                pass

        log.info("Input names: " + ",".join(names))

        return pages

    def cleanName(self, name):
        """ignore all special characters, numbers, whitespace, case"""
        return ''.join(c for c in name.lower() if c in string.ascii_lowercase)

    def __get_correct_page(self, checked_name, all_pages):
        # Gets first page
        first_page = None
        log_string = ""

        #clean_name = self.cleanName(checked_name)
        clean_name = checked_name.replace(" ", "+")

        # Checks for any direct hits.
        # difflib.get_close_matches[0]
        for nr, page in enumerate(all_pages.values()):
            title = page['title']
            title_clean = self.cleanName(title)
            log_string += title + ","
            if title_clean == clean_name:
                log.info("Found direct match, page nr {}: {}".format(nr + 1, clean_name))
                first_page = page
                break


        # Get first containing
        # if not first_page:
        #     pages = all_pages.values()
        #     pages_containing = [page for page in pages if checked_name in page['title'].lower()]
        #     if pages_containing:
        #         first_page = pages_containing[0]
        #         print("bingo")

        # Gets first entry
        if not first_page:
            first_page = next(iter(all_pages.values()))

        log.info("Input name: {} \n Parsed titles were: {}.\n Result title was: {}".format(checked_name, log_string[:-1],
                                                                                           first_page["title"]))

        return first_page


    def __fetch_page(self, name):

        # Returns translated name or the same name
        #clean_name = self.cleanName(name)
        checked_name = self.constants.translateAlt(name.lower())

        # All pages with "name" in there, and their URLs.
        fetch_json = requests.get(self._querystartlink + checked_name.title()
                                  ).json() #'Use "gapfilterredir=nonredirects" option instead of "redirects" when using allpages as a generator' #gaplimit=1


        # Gets the first page
        #all_pages = fetch_json['query']['pages']

        #first_page = self.__get_correct_page(checked_name, all_pages)

        first_page = fetch_json["items"][0]

        return first_page

        # ASSUME THAT THE FIRST LINK IS CORRECT - MIGHT BE REDIRECTION LINK!


        def check_title(self):
            pass

    def fetch_image_url(self, page_id):

        image_json = requests.get(self._imagestartlink+str(page_id)).json()

        try:
            image_url_dirty = image_json["image"]["imageserving"]
            image_url = (image_url_dirty.split("/revision/"))[0]

            return image_url
        except KeyError:
            log.info("Couldn't parse image url")
            return ""

    def fetch_summary(self, page_id):

        fetch_json = requests.get(self._summarystartlink+str(page_id)).json()

        return fetch_json["sections"][0]["content"][0]["text"]


class SpellChecker():
    """Find and fix simple spelling errors.
    based on Peter Norvig
    http://norvig.com/spell-correct.html
    """
    def __init__(self, names):
        self.model = set(names)

    def __known(self, words):
        for w in words:
            if w in self.model:
                return w
        return None

    def __edits(self, word):
        splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes    = (a + b[1:] for a, b in splits if b)
        transposes = (a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1)
        replaces   = (a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b)
        inserts    = (a + c + b     for a, b in splits for c in string.ascii_lowercase)
        return itertools.chain(deletes, transposes, replaces, inserts)

    def correct(self, word):
        """returns input word or fixed version if found"""
        return self.__known([word]) or self.__known(self.__edits(word)) or word

    """
    # distance 2
    def known_edits2(word):
        return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
    """

# Test
#print(requests.get(startlink+'generator=allpages&gapfrom=Luffy&prop=info').content) #prop=info&inprop=url


#image_json = requests.get(startlink+'generator=allpages&gapfrom=Luffy&prop=images').json()
#print(image_json)
#test_output = image_json['query-continue']['']

#All images from the Monkey D. Luffy page
#print(requests.get('https://onepiece.fandom.com/api.php?format=json&action=query&generator=images&titles=Monkey_D._Luffy&prop=imageinfo').content)
	import itertools
	import string

	import requests
	import logging as log
	from constants import Constants
	"""
	S = requests.Session()

	PARAMS = {
	"action": "query",
	"format": "json",
	"generator": "allpages",
	"gtitles":"Luffy",
	"list": "allimages"
	}

	url = 'https://onepiece.fandom.com/api.php'

	print(requests.get(url=url, params=PARAMS).content)
	"""

	class Fetcher:

	def __init__(self, wiki_name):

	self.constants = Constants()

	wiki_site = 'https://{wiki_name}.fandom.com'.format(wiki_name=wiki_name)

	self._querystartlink = wiki_site + '/api/v1/Search/List?query='
	self._queryendlink = '&limit=1&minArticleQuality=10&batch=1&namespaces=0%2C14'

	self._imagestartlink = wiki_site + '/api.php?format=json&action=imageserving&wisId='
	self._summarystartlink = wiki_site + "/api/v1/Articles/AsSimpleJson?id="


	#self._endlink = self._startlink+'query&'

	# '&prop=info&inprop=url&generator=allpages&gapfromSS='

	def get_wiki_pages(self, names):
	pages = []

	for name in names:
	try:
	pages.append(self.__fetch_page(name))
	except KeyError:
	pass

	log.info("Input names: " + ",".join(names))

	return pages

	def cleanName(self, name):
	"""ignore all special characters, numbers, whitespace, case"""
	return ''.join(c for c in name.lower() if c in string.ascii_lowercase)

	def __get_correct_page(self, checked_name, all_pages):
	# Gets first page
	first_page = None
	log_string = ""

	#clean_name = self.cleanName(checked_name)
	clean_name = checked_name.replace(" ", "+")

	# Checks for any direct hits.
	# difflib.get_close_matches[0]
	for nr, page in enumerate(all_pages.values()):
	title = page['title']
	title_clean = self.cleanName(title)
	log_string += title + ","
	if title_clean == clean_name:
	log.info("Found direct match, page nr {}: {}".format(nr + 1, clean_name))
	first_page = page
	break


	# Get first containing
	# if not first_page:
	# pages = all_pages.values()
	# pages_containing = [page for page in pages if checked_name in page['title'].lower()]
	# if pages_containing:
	# first_page = pages_containing[0]
	# print("bingo")

	# Gets first entry
	if not first_page:
	first_page = next(iter(all_pages.values()))

	log.info("Input name: {} \n Parsed titles were: {}.\n Result title was: {}".format(checked_name, log_string[:-1],
	first_page["title"]))

	return first_page



	def __fetch_page(self, name):

	# Returns translated name or the same name
	#clean_name = self.cleanName(name)
	checked_name = self.constants.translateAlt(name.lower())

	# All pages with "name" in there, and their URLs.
	fetch_json = requests.get(self._querystartlink + checked_name.title()
	).json() #'Use "gapfilterredir=nonredirects" option instead of "redirects" when using allpages as a generator' #gaplimit=1


	# Gets the first page
	#all_pages = fetch_json['query']['pages']

	#first_page = self.__get_correct_page(checked_name, all_pages)

	first_page = fetch_json["items"][0]

	return first_page

	# ASSUME THAT THE FIRST LINK IS CORRECT - MIGHT BE REDIRECTION LINK!



	def check_title(self):
	pass

	def fetch_image_url(self, page_id):

	image_json = requests.get(self._imagestartlink+str(page_id)).json()

	try:
	image_url_dirty = image_json["image"]["imageserving"]
	image_url = (image_url_dirty.split("/revision/"))[0]

	return image_url
	except KeyError:
	log.info("Couldn't parse image url")
	return ""

	def fetch_summary(self, page_id):

	fetch_json = requests.get(self._summarystartlink+str(page_id)).json()

	return fetch_json["sections"][0]["content"][0]["text"]



	class SpellChecker():
	"""Find and fix simple spelling errors.
	based on Peter Norvig
	http://norvig.com/spell-correct.html
	"""
	def __init__(self, names):
	self.model = set(names)

	def __known(self, words):
	for w in words:
	if w in self.model:
	return w
	return None

	def __edits(self, word):
	splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
	deletes = (a + b[1:] for a, b in splits if b)
	transposes = (a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1)
	replaces = (a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b)
	inserts = (a + c + b for a, b in splits for c in string.ascii_lowercase)
	return itertools.chain(deletes, transposes, replaces, inserts)

	def correct(self, word):
	"""returns input word or fixed version if found"""
	return self.__known([word]) or self.__known(self.__edits(word)) or word

	"""
	# distance 2
	def known_edits2(word):
	return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
	"""

	# Test
	#print(requests.get(startlink+'generator=allpages&gapfrom=Luffy&prop=info').content) #prop=info&inprop=url


	#image_json = requests.get(startlink+'generator=allpages&gapfrom=Luffy&prop=images').json()
	#print(image_json)
	#test_output = image_json['query-continue']['']

	#All images from the Monkey D. Luffy page
	#print(requests.get('https://onepiece.fandom.com/api.php?format=json&action=query&generator=images&titles=Monkey_D._Luffy&prop=imageinfo').content)