# simonlindgren/ttd.py

## ttd.py
'''
TOPIC TOP DOCUMENTS
'''

# Required libraries and settings
import re
import pandas as pd
# None means "never truncate cell contents"; the old -1 sentinel was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)


def load_doc_topics(path='doc-topics_50.txt'):
    """Read a space-separated doc-topics file into a DataFrame.

    The first line of the file is used as the column header. Only the
    '#doc', 'topic' and 'proportion' columns are kept, and every value is
    left as the string that was read from the file.

    Args:
        path: Location of the doc-topics file.

    Returns:
        DataFrame with the columns '#doc', 'topic' and 'proportion'.

    Raises:
        ValueError: If the file contains no lines at all.
        KeyError: If one of the three required columns is missing.
    """
    # The context manager guarantees the handle is closed. Stripping the
    # newline keeps the last field of each line clean, so a header that ends
    # with 'proportion' is still matched by name.
    with open(path, 'r') as infile:
        rows = [line.rstrip('\n').split(" ") for line in infile]
    if not rows:
        raise ValueError('no header line found in ' + repr(path))

    table = pd.DataFrame(rows)
    table.columns = table.iloc[0]  # first line holds the column headers
    table = table[1:]  # drop the header row from the data
    return table.loc[:, ['#doc', 'topic', 'proportion']]


def topic_rows(table, topic="3"):
    """Return the rows of *table* for one topic, with float proportions.

    Args:
        table: DataFrame as returned by ``load_doc_topics``.
        topic: Topic number to keep, as the string found in the file.

    Returns:
        A new DataFrame holding only the rows whose 'topic' equals *topic*,
        with the 'proportion' column converted from str to float.
    """
    selected = table.loc[table['topic'] == topic]
    # Convert only after filtering, so malformed rows that belong to other
    # topics (or blank lines) cannot break the float conversion.
    return selected.assign(proportion=selected['proportion'].astype(float))


def candidate_thresholds(proportions, top_n=20, split=0.87):
    """Compute the candidate cut-offs for calling a topic 'strong'.

    Args:
        proportions: Float Series of topic proportions.
        top_n: Number of documents the 'n_largest' cut-off should let through
            when used with a strict ``>`` comparison.
        split: Manually chosen cut-off, passed through unchanged.

    Returns:
        Dict with the keys 'median', 'percentile' (75th percentile),
        'n_largest' (the value of the (top_n + 1)-th largest proportion)
        and 'split'.
    """
    return {
        'median': proportions.median(),
        'percentile': proportions.quantile(.75),
        'n_largest': proportions.nlargest(top_n + 1).min(),
        'split': split,
    }


def top_documents(rows, threshold):
    """Return the '#doc' values whose proportion is strictly above *threshold*.

    Args:
        rows: DataFrame as returned by ``topic_rows``.
        threshold: Numeric cut-off for the 'proportion' column.

    Returns:
        List of '#doc' values (strings), in file order.
    """
    return rows.loc[rows['proportion'] > threshold, '#doc'].tolist()


def write_topdocs(topdocs, path='topdocs.txt'):
    """Write one document number per line to *path*, replacing the file."""
    with open(path, 'w') as outfile:
        outfile.writelines(doc + '\n' for doc in topdocs)


def main():
    """Extract the top documents for topic "3" and save them to topdocs.txt."""
    rows = topic_rows(load_doc_topics('doc-topics_50.txt'), topic="3")

    # Inspect the candidate thresholds if needed.
    cutoffs = candidate_thresholds(rows['proportion'])
    print(cutoffs['median'], cutoffs['percentile'],
          cutoffs['split'], cutoffs['n_largest'])

    # In this example, the median is chosen as the threshold.
    write_topdocs(top_documents(rows, cutoffs['median']), 'topdocs.txt')


if __name__ == "__main__":
    main()