Skip to content

Instantly share code, notes, and snippets.

@ajbrock
Last active November 8, 2018 17:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ajbrock/0ede94afed21325e2cbf87e0e50763d6 to your computer and use it in GitHub Desktop.
Save ajbrock/0ede94afed21325e2cbf87e0e50763d6 to your computer and use it in GitHub Desktop.
import numpy as np
# Corpus available here: https://pastebin.com/WqD6fAgu
# Corpus taken from https://dominionstrategy.com/all-cards/
# Load the whole corpus up front: x becomes a list with one entry per
# line of the HTML file (line endings included).
with open('dominion_cards.html', 'r') as corpus_file:
    x = list(corpus_file)
# Convenience function to count words, used later
def count_words(text):
    """Return the word count of text, counting some symbols as words.

    Every '+', '$' and '-' character counts as one word and is then
    deleted from the text (so 'non-Victory' contributes the hyphen plus
    the single remaining token 'nonVictory'). Each '/' is turned into a
    space so both sides are counted separately. Whatever is left is split
    on the space character only; empty pieces produced by consecutive or
    trailing spaces are not counted.

    Args:
        text: the string to measure.

    Returns:
        The number of symbols plus the number of non-empty
        space-separated tokens, as an int.
    """
    count = 0
    # Tally the symbol characters, removing each kind once it is counted
    # so it cannot also inflate the token count below.
    for symbol in ('+', '$', '-'):
        count += text.count(symbol)
        text = text.replace(symbol, '')
    # Slashes separate alternatives; treat them as word boundaries.
    text = text.replace('/', ' ')
    # split(' ') yields '' between adjacent spaces; skip those.
    count += sum(1 for token in text.split(' ') if token)
    return count
# Extract cards and format pythonically
def _strip_tags(line):
    """Return line with every '<...>' span removed.

    The closing '>' is searched for starting at the opening '<', so a
    stray '>' earlier in the line cannot cause text to be duplicated, and
    an unterminated '<' ends the scan (the remainder is left as is).
    Every pass either stops or shortens the line, so the loop always
    terminates without needing an iteration cap.
    """
    start = line.find('<')
    while start != -1:
        end = line.find('>', start)
        if end == -1:
            break
        line = line[:start] + line[end + 1:]
        start = line.find('<', start)
    return line


# c holds one list per table row; each entry of a row is the cleaned
# text of one source line found between that row's <tr> and </tr> lines.
c = []
active = False
# Substrings deleted outright from every cell
weasels = ['\n']
for s in x:
    # The <tr> indicates the start of a card
    if '<tr>' in s:
        active = True
        c.append([])
    elif '</tr>' in s:
        active = False
    elif active:
        s = _strip_tags(s)
        for word in weasels:
            s = s.replace(word, '')
        # Plural first, so 'Victory Points' becomes 'VP' rather than 'VPs'
        s = s.replace('Victory Points', 'VP')
        s = s.replace('Victory Point', 'VP')
        c[-1].append(s)

# Toss landmarks and other somesuch so and sos: a row is dropped when its
# second cell contains any of these words (presumably that cell is the
# card-type column -- confirm against the corpus). Rows with fewer than
# two cells cannot be classified and are dropped too instead of raising
# IndexError.
excluded = ('Boon', 'Landmark', 'Hex', 'State', 'Event', 'Castle', 'Ruins', 'Shelter')
c = [item for item in c
     if len(item) > 1 and not any(word in item[1] for word in excluded)]
# One text blob per card: every cell from the fourth onward, comma-joined
txt = [', '.join(item[3:]) for item in c]
# Count words
counts = [count_words(item) for item in txt]
# order[k] is the index (into c / txt / counts) of the card with the
# k-th smallest word count
order = np.argsort(counts)
# Print out num_display cards and their wordcounts; display more than ten so we can skip ones with mistakes or errors
num_display = 25
# Never index past the end when the corpus yields fewer cards than asked for
shown = min(num_display, len(order))
print('Displaying %d cards with lowest number of words...' % shown)
for rank in range(shown):
    index = order[rank]
    # Convert only the row being printed: this keeps the array-style
    # rendering of the row while avoiding a rebuild of the whole table on
    # every line (which also fails in recent NumPy if rows differ in length).
    print('#%d: %s, %s' % (rank, np.asarray(c[index]), counts[index]))
# Which ones I select
print('-------------------------------------')
print('-------------------------------------')
# Positions in the sorted listing above that were picked by hand; they
# are only meaningful for the corpus this script was written against.
my_indices = [4, 7, 8, 9, 10, 11, 12, 13, 14, 15]
for num, picked in enumerate(my_indices, 1):
    index = order[picked]
    print('#%d: %s, %s' % (num, np.asarray(c[index]), counts[index]))
print('Sum of all words is %d' % sum(counts[order[index]] for index in my_indices))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment