available pypi synonyms
# check which synonyms of a given word are still available as pypi package names
# usage: $ available_pypi_synonyms.py <words...>
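#
# the flags below mirror the argparse options defined at the bottom of this file:
#   -p/--pages N      number of thesaurus result pages to fetch per word
#   -a/--available    only print names that look unclaimed on pypi
#   -s/--status       also print the http status code returned by pypi
#
# a hypothetical invocation (the word 'fast' is just an example -- the synonyms
# returned depend on what powerthesaurus.org currently lists, so output varies):
#   $ python available_pypi_synonyms.py fast -p 2 -a -s
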
import argparse
import itertools
import re
import diskcache as dc
import requests
from bs4 import BeautifulSoup
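
# third-party requirements (assumed install names):
#   pip install diskcache requests beautifulsoup4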

# ========================================================================= #
# Ansi Colors                                                               #
# ========================================================================= #

RST = '\033[0m'
# light colors (ansi bright, 90-97)
GRY = '\033[90m'
lRED = '\033[91m'
lGRN = '\033[92m'
lYLW = '\033[93m'
lBLU = '\033[94m'
lMGT = '\033[95m'
lCYN = '\033[96m'
WHT = '\033[97m'
# dark colors (ansi standard, 30-37)
BLK = '\033[30m'
RED = '\033[31m'
GRN = '\033[32m'
YLW = '\033[33m'
BLU = '\033[34m'
MGT = '\033[35m'
CYN = '\033[36m'
lGRY = '\033[37m'

# ========================================================================= #
# Time                                                                      #
# ========================================================================= #

SEC = 1
MIN = SEC * 60
HOUR = MIN * 60
DAY = HOUR * 24
WEEK = DAY * 7

# ========================================================================= #
# UTIL                                                                      #
# ========================================================================= #

# cache data so we don't need to make repeated requests
_CACHE = dc.Cache('_cache_/synonyms')
# fake a request from a browser
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}


@_CACHE.memoize(expire=1*WEEK)
def fetch_page_content(url):
    page = requests.get(url, headers=_HEADERS)
    return page


@_CACHE.memoize(expire=1*DAY)
def get_status_code(url):
    return requests.head(url, headers=_HEADERS).status_code


def normalize(name):
    # https://www.python.org/dev/peps/pep-0503/#normalized-names
    return re.sub(r"[-_. ]+", "-", name).lower()
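
# a quick illustration of the PEP 503 normalization above (the sample names are
# made up for the example): runs of '-', '_', '.' and ' ' collapse into a single
# '-' and the result is lower-cased, e.g.
#   normalize('My_Package.Name')  # -> 'my-package-name'
#   normalize('Fast  API')        # -> 'fast-api'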

# ========================================================================= #
# Synonyms                                                                  #
# ========================================================================= #


def fetch_synonyms_page_words(word: str, page=1):
    # normalize the word for use in the url
    normalized_word = normalize(word)
    assert normalized_word.isidentifier(), f'normalized word: {repr(word)} -> {repr(normalized_word)} is not a valid identifier!'
    # fetch the content
    url = 'https://www.powerthesaurus.org/{word}/synonyms/{page}'.format(word=normalized_word, page=page)
    response = fetch_page_content(url)
    assert response.status_code == 200, f'synonyms response returned the wrong status code: {response.status_code}, should be 200'
    # parse the page & extract the synonyms
    soup = BeautifulSoup(response.content, 'html.parser')
    synonym_blocks = soup.find_all('div', id="primary-area", recursive=True)
    synonyms = [block.find('a').text for block in synonym_blocks]
    return synonyms


def yield_synonyms(word: str, pages=1):
    unique = set()
    # for each page, only yield synonyms that have not been seen before
    for page in range(pages):
        words = fetch_synonyms_page_words(word=word, page=page+1)
        # stop early if the page adds nothing new
        if not (set(words) - unique):
            break
        for synonym in words:
            if synonym not in unique:
                yield synonym
                unique.add(synonym)
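
# a minimal usage sketch of the generator above ('quick' is an arbitrary example
# query -- the actual synonyms come from powerthesaurus.org and will vary):
#
#   for synonym in yield_synonyms('quick', pages=2):
#       print(synonym)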

# ========================================================================= #
# Pypi                                                                      #
# ========================================================================= #


def yield_pypi_synonyms(word, pages=1):
    # check the original word itself, then each of its synonyms in turn
    for synonym in itertools.chain([word], yield_synonyms(word, pages=pages)):
        normalized_synonym = normalize(synonym)
        # a 404 from the project page means the name is not yet taken on pypi
        status_code = get_status_code('https://pypi.org/project/{project}/'.format(project=normalized_synonym))
        yield (normalized_synonym, status_code == 404, status_code)
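
# each yielded tuple has the form (normalized_name, available, status_code),
# where `available` is True when https://pypi.org/project/<name>/ returns 404,
# e.g. a hypothetical result for an unclaimed name: ('my-package', True, 404)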


def print_pypi_synonyms(words, pages=1, available_only=False, show_status=False):
    for word in words:
        title = f'Synonyms for {repr(word)}:'
        # print the heading
        print('='*len(title))
        print(title)
        print('='*len(title))
        print()
        # print the candidate names -- ✅ means the name is still available on pypi
        for i, (synonym, available, status_code) in enumerate(yield_pypi_synonyms(word, pages=pages)):
            status = f" [{status_code:3d}]" if show_status else ""
            if available:
                print(f'{i:3d}: ✅{status} {lGRN}{synonym}{RST}')
            elif not available_only:
                print(f'{i:3d}: ❌{status} {RED}{synonym}{RST}')
        # end
        print()

# ========================================================================= #
# Entrypoint                                                                #
# ========================================================================= #


if __name__ == '__main__':
    DEFAULT_WORDS = ['default']
    DEFAULT_PAGES = 1

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('words', type=str, nargs='*', default=DEFAULT_WORDS)
    parser.add_argument('-p', '--pages', type=int, default=DEFAULT_PAGES)
    parser.add_argument('-a', '--available', action='store_true')
    parser.add_argument('-s', '--status', action='store_true')
    args = parser.parse_args()

    # print everything
    print_pypi_synonyms(words=args.words, pages=args.pages, available_only=args.available, show_status=args.status)

# ========================================================================= #
# END                                                                       #
# ========================================================================= #