Last active
January 23, 2017 16:14
-
-
Save jrjames83/eeda4324116dde12ea8d45c47a151ab0 to your computer and use it in GitHub Desktop.
Tag frequencies and text similarity using the stdlib SequenceMatcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter, defaultdict | |
from itertools import product | |
from difflib import SequenceMatcher | |
import requests | |
from bs4 import BeautifulSoup | |
from nltk.corpus import wordnet as wn | |
FEED_URL = "http://pybit.es/feeds/all.rss.xml"
SIMILARITY_THRESHOLD = .85


def fetch_categories(url=FEED_URL, timeout=10):
    """Return the text of every <category> element in the feed at *url*.

    Each value is lower-cased and stripped of surrounding whitespace.
    Duplicates are kept so that the caller can count tag frequencies.

    :param url: address of the RSS feed to download.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.RequestException: on connection failure, timeout, or
        (through ``raise_for_status``) an HTTP error response.
    """
    # Without a timeout a stalled server would block the script forever.
    page = requests.get(url, timeout=timeout)
    # Fail loudly rather than counting "tags" found in an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "lxml")
    return [c.get_text().lower().strip() for c in soup.find_all("category")]


def top_tags(categories, n=10):
    """Return the *n* most frequent tags as ``(tag, count)`` pairs.

    Pairs are ordered from most to least common; an empty input gives an
    empty list.
    """
    return Counter(categories).most_common(n)


def similar_tags(terms, threshold=SIMILARITY_THRESHOLD):
    """Yield ``(tag_a, tag_b, ratio)`` for every pair of look-alike tags.

    Every ordered pair drawn from *terms* is compared with
    ``difflib.SequenceMatcher``; a pair is reported when its ratio is
    strictly greater than *threshold*.  Both orderings of a pair are
    reported, so ``("game", "games")`` and ``("games", "game")`` each appear.

    A tag is never reported as similar to itself.  The comparison is by
    value (``==``), not identity: two equal strings are not guaranteed to
    be the same object, so an ``is`` test could report a tag against its
    own duplicate.
    """
    for p1, p2 in product(terms, repeat=2):
        if p1 == p2:
            continue
        ratio = SequenceMatcher(None, p1, p2).ratio()
        if ratio > threshold:
            yield p1, p2, ratio


def main():
    """Download the feed, then print the top ten tags and the similar tags."""
    categories = fetch_categories()

    # Single-argument print(...) calls produce the same output on
    # Python 2 and Python 3.
    print("Top 10 Tags:")
    print("---------------------")
    for term, freq in top_tags(categories):
        print("%s %s" % (term, freq))

    print("Getting Similar Tags")
    print("---------------------")
    # De-duplicate first: each distinct tag only needs comparing once.
    for p1, p2, ratio in similar_tags(set(categories)):
        print("%s %s %s" % (p1, p2, ratio))


if __name__ == "__main__":
    main()

# Sample output recorded from a Python 2 run.  The literal is a raw string
# because the Windows path contains "\U", which is an invalid escape (a
# SyntaxError on Python 3) in a normal string.
r"""
C:\Users\Jeffrey\OneDrive\coding\challenges>python pybites_tags.py
Top 10 Tags:
---------------------
python 12
learning 9
tips 8
github 7
cleancode 6
tricks 6
code challenges 5
scrabble 4
virtualenv 4
pythonic 4
Getting Similar Tags
---------------------
generator generators 0.947368421053
generators generator 0.947368421053
challenges challenge 0.947368421053
python pythonic 0.857142857143
game games 0.888888888889
pythonic python 0.857142857143
games game 0.888888888889
challenge challenges 0.947368421053
best practices best-practices 0.928571428571
best-practices best practices 0.928571428571
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment