available pypi synonyms
# check which synonyms of a given word are still available as pypi package names
# usage: $ available_pypi_synonyms.py <words...>
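#
# the flags below mirror the argparse options defined at the bottom of this file:
#   -p/--pages N      number of thesaurus result pages to fetch per word
#   -a/--available    only print names that look unclaimed on pypi
#   -s/--status       also print the http status code returned by pypi
#
# a hypothetical invocation (the word 'fast' is just an example -- the synonyms
# returned depend on what powerthesaurus.org currently lists, so output varies):
#   $ python available_pypi_synonyms.py fast -p 2 -a -s
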
import argparse
import itertools
import re
import diskcache as dc
import requests
from bs4 import BeautifulSoup
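
# third-party requirements (assumed install names):
#   pip install diskcache requests beautifulsoup4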

# ========================================================================= #
# Ansi Colors                                                               #
# ========================================================================= #

RST = '\033[0m'
# light colors (ansi bright, 90-97)
GRY = '\033[90m'
lRED = '\033[91m'
lGRN = '\033[92m'
lYLW = '\033[93m'
lBLU = '\033[94m'
lMGT = '\033[95m'
lCYN = '\033[96m'
WHT = '\033[97m'
# dark colors (ansi standard, 30-37)
BLK = '\033[30m'
RED = '\033[31m'
GRN = '\033[32m'
YLW = '\033[33m'
BLU = '\033[34m'
MGT = '\033[35m'
CYN = '\033[36m'
lGRY = '\033[37m'

# ========================================================================= #
# Time                                                                      #
# ========================================================================= #

SEC = 1
MIN = SEC * 60
HOUR = MIN * 60
DAY = HOUR * 24
WEEK = DAY * 7

# ========================================================================= #
# UTIL                                                                      #
# ========================================================================= #

# cache data so we don't need to make repeated requests
_CACHE = dc.Cache('_cache_/synonyms')
# fake a request from a browser
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}


@_CACHE.memoize(expire=1*WEEK)
def fetch_page_content(url):
    page = requests.get(url, headers=_HEADERS)
    return page


@_CACHE.memoize(expire=1*DAY)
def get_status_code(url):
    return requests.head(url, headers=_HEADERS).status_code


def normalize(name):
    # https://www.python.org/dev/peps/pep-0503/#normalized-names
    return re.sub(r"[-_. ]+", "-", name).lower()
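
# a quick illustration of the PEP 503 normalization above (the sample names are
# made up for the example): runs of '-', '_', '.' and ' ' collapse into a single
# '-' and the result is lower-cased, e.g.
#   normalize('My_Package.Name')  # -> 'my-package-name'
#   normalize('Fast  API')        # -> 'fast-api'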

# ========================================================================= #
# Synonyms                                                                  #
# ========================================================================= #


def fetch_synonyms_page_words(word: str, page=1):
    # normalize the word for use in the url
    normalized_word = normalize(word)
    assert normalized_word.isidentifier(), f'normalized word: {repr(word)} -> {repr(normalized_word)} is not a valid identifier!'
    # fetch the content
    url = 'https://www.powerthesaurus.org/{word}/synonyms/{page}'.format(word=normalized_word, page=page)
    response = fetch_page_content(url)
    assert response.status_code == 200, f'synonyms response returned the wrong status code: {response.status_code}, should be 200'
    # parse the page & extract the synonyms
    soup = BeautifulSoup(response.content, 'html.parser')
    synonym_blocks = soup.find_all('div', id="primary-area", recursive=True)
    synonyms = [block.find('a').text for block in synonym_blocks]
    return synonyms


def yield_synonyms(word: str, pages=1):
    unique = set()
    # for each page, only yield synonyms that have not been seen before
    for page in range(pages):
        words = fetch_synonyms_page_words(word=word, page=page+1)
        # stop early if the page adds nothing new
        if not (set(words) - unique):
            break
        for synonym in words:
            if synonym not in unique:
                yield synonym
                unique.add(synonym)
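
# a minimal usage sketch of the generator above ('quick' is an arbitrary example
# query -- the actual synonyms come from powerthesaurus.org and will vary):
#
#   for synonym in yield_synonyms('quick', pages=2):
#       print(synonym)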

# ========================================================================= #
# Pypi                                                                      #
# ========================================================================= #


def yield_pypi_synonyms(word, pages=1):
    # check the original word itself, then each of its synonyms in turn
    for synonym in itertools.chain([word], yield_synonyms(word, pages=pages)):
        normalized_synonym = normalize(synonym)
        # a 404 from the project page means the name is not yet taken on pypi
        status_code = get_status_code('https://pypi.org/project/{project}/'.format(project=normalized_synonym))
        yield (normalized_synonym, status_code == 404, status_code)
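
# each yielded tuple has the form (normalized_name, available, status_code),
# where `available` is True when https://pypi.org/project/<name>/ returns 404,
# e.g. a hypothetical result for an unclaimed name: ('my-package', True, 404)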


def print_pypi_synonyms(words, pages=1, available_only=False, show_status=False):
    for word in words:
        title = f'Synonyms for {repr(word)}:'
        # print the heading
        print('='*len(title))
        print(title)
        print('='*len(title))
        print()
        # print the candidate names -- ✅ means the name is still available on pypi
        for i, (synonym, available, status_code) in enumerate(yield_pypi_synonyms(word, pages=pages)):
            status = f" [{status_code:3d}]" if show_status else ""
            if available:
                print(f'{i:3d}: ✅{status} {lGRN}{synonym}{RST}')
            elif not available_only:
                print(f'{i:3d}: ❌{status} {RED}{synonym}{RST}')
        # end
        print()

# ========================================================================= #
# Entrypoint                                                                #
# ========================================================================= #


if __name__ == '__main__':
    DEFAULT_WORDS = ['default']
    DEFAULT_PAGES = 1

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('words', type=str, nargs='*', default=DEFAULT_WORDS)
    parser.add_argument('-p', '--pages', type=int, default=DEFAULT_PAGES)
    parser.add_argument('-a', '--available', action='store_true')
    parser.add_argument('-s', '--status', action='store_true')
    args = parser.parse_args()

    # print everything
    print_pypi_synonyms(words=args.words, pages=args.pages, available_only=args.available, show_status=args.status)

# ========================================================================= #
# END                                                                       #
# ========================================================================= #