mitchelljkotler/client.py Secret

## client.py
"""
This defines the DocumentCloud client to be imported into the other scripts
Make sure you have python-documentcloud >= 2.2.0

These scripts use the 20 newsgroup data, found here:
http://qwone.com/~jason/20Newsgroups/

You may download that if you would like to test, or feel free to modify
the scripts to fit your own data
"""
import logging
import os

from documentcloud import DocumentCloud

# account name used to log in; change this to your own username
USERNAME = "mitch"
# the password is read from the DC_PASSWORD environment variable so that it
# never has to be saved in this file (None when the variable is not set)
PASSWORD = os.getenv("DC_PASSWORD")

# ID of the project the other scripts operate on
PROJECT_ID = 204306
# name of the data tag that marks documents as in or out of the set;
# tag documents with the values "true" and "false"
TAG_NAME = "politics"
# local folders holding the documents, one folder per newsgroup
NEWSGROUPS = ["talk.politics.misc", "rec.autos"]

# shared client imported by the other scripts
# loglevel=logging.INFO turns on verbose logging; remove it or pick another
# level to suit your needs
client = DocumentCloud(
    username=USERNAME, password=PASSWORD, loglevel=logging.INFO, timeout=30
)

## learn.py
"""
Run this script third

Be sure to mark some documents as being in or out of the interested set
by setting the TAG_NAME data to true or false on some of the documents
This can be done via the web interface or via the API (example not provided)

This script will run the metric lego learning, which will assign a score to each
document under the data {TAG_NAME}_score
"""
import time

from client import PROJECT_ID, TAG_NAME, client

print("Starting...")

# kick off a learning run for the configured tag and show what came back
learn_response = client.post(
    f"projects/{PROJECT_ID}/sidekick/learn/",
    data={"tagname": TAG_NAME},
)
print(learn_response.status_code)
print(learn_response.json())

# poll the sidekick, printing each status, until it is no longer "pending"
sidekick_url = f"projects/{PROJECT_ID}/sidekick/"
while True:
    status = client.get(sidekick_url).json()["status"]
    print(status)
    if status != "pending":
        break
    # wait ten seconds before asking again
    time.sleep(10)

print("Done!")

## score.py
"""
Run this script fourth

This will return some statistics on your documents.  If you have pre-labelled
documents, you can get a sense of how well the scoring is doing.  If it is not
good enough, you can try marking more of your documents as being in or out of the set,
then re-running the metric lego learning (learn.py).

If your documents are not pre-labelled, you could modify this to sort by most
interesting documents, view those, manually confirm if they are labelled correctly,
and then re-run learn.py to continue finding the most relevant documents
"""

import statistics

from client import NEWSGROUPS, PROJECT_ID, TAG_NAME, client

print("Starting...")

# data key under which the learning step stores each document's score
score_key = f"{TAG_NAME}_score"

for newsgroup in NEWSGROUPS:
    print(f"Newsgroup {newsgroup}...")
    # find all of the documents we uploaded from this folder
    documents = client.documents.search(
        "", project=PROJECT_ID, data_newsgroup=newsgroup,
    )

    count = len(documents)
    # collect the scores, skipping documents that have no score recorded
    # instead of crashing on the first one
    scores = []
    for document in documents:
        try:
            scores.append(float(document.data[score_key][0]))
        except (KeyError, IndexError):
            continue

    # calculate some statistics to see how well the scorer did
    print(f"Count: {count}")
    if not scores:
        # mean/median/min/max all raise on an empty sequence
        print(f"No documents with a {score_key} value; run learn.py first")
        continue
    print(f"Mean: {statistics.mean(scores)}")
    print(f"Median: {statistics.median(scores)}")
    print(f"Min: {min(scores)}")
    print(f"Max: {max(scores)}")
    # the sample standard deviation needs at least two data points
    if len(scores) > 1:
        print(f"Std Dev: {statistics.stdev(scores)}")
    else:
        print("Std Dev: n/a (need at least two scores)")


print("Done!")

## sidekick.py
"""
Run this script second
It will initialize the sidekick instance and preprocess all the files in the project
This may take some time depending on the number of documents in the project

If you add or remove documents from your project, you should re-run this script
to re-analyze your documents
"""
import time

from client import PROJECT_ID, client

print("Starting...")

sidekick_url = f"projects/{PROJECT_ID}/sidekick/"

# create the sidekick on the project (the reply itself is not used)
client.post(sidekick_url)

# poll the sidekick, printing each status, until it is no longer "pending"
while True:
    status = client.get(sidekick_url).json()["status"]
    print(status)
    if status != "pending":
        break
    # wait one minute before asking again
    time.sleep(60)

print("Done!")

## upload.py
"""
Run this script first
It will upload all of your documents into the project
"""

from client import NEWSGROUPS, PROJECT_ID, client

print("Starting...")

print("Uploading...")

# options that are the same for every folder
shared_options = {
    # the files have no extension, so pass None
    # (the original note says this would otherwise default to pdf)
    "extension": None,
    "project": PROJECT_ID,
    # the files are plain text even though they are not named *.txt
    "original_extension": "txt",
    "handle_errors": True,
}

for newsgroup in NEWSGROUPS:
    print(f"Newsgroup {newsgroup}...")
    client.documents.upload_directory(
        newsgroup,
        # record which folder each document came from;
        # skip this if your documents are not already categorized
        data={"newsgroup": newsgroup},
        **shared_options,
    )

print("Done!")
	"""
	This defines the DocumentCloud client to be imported into the other scripts
	Make sure you have python-documentcloud >= 2.2.0

	These scripts use the 20 newsgroup data, found here:
	http://qwone.com/~jason/20Newsgroups/

	You may download that if you would like to test, or feel free to modify
	the scripts to fit your own data
	"""
	import logging
	import os

	from documentcloud import DocumentCloud

	# account name used to log in; change this to your own username
	USERNAME = "mitch"
	# the password is read from the DC_PASSWORD environment variable so that it
	# never has to be saved in this file (None when the variable is not set)
	PASSWORD = os.getenv("DC_PASSWORD")

	# ID of the project the other scripts operate on
	PROJECT_ID = 204306
	# name of the data tag that marks documents as in or out of the set;
	# tag documents with the values "true" and "false"
	TAG_NAME = "politics"
	# local folders holding the documents, one folder per newsgroup
	NEWSGROUPS = ["talk.politics.misc", "rec.autos"]

	# shared client imported by the other scripts
	# loglevel=logging.INFO turns on verbose logging; remove it or pick another
	# level to suit your needs
	client = DocumentCloud(
	    username=USERNAME, password=PASSWORD, loglevel=logging.INFO, timeout=30
	)
	"""
	Run this script third

	Be sure to mark some documents as being in or out of the interested set
	by setting the TAG_NAME data to true or false on some of the documents
	This can be done via the web interface or via the API (example not provided)

	This script will run the metric lego learning, which will assign a score to each
	document under the data {TAG_NAME}_score
	"""
	import time

	from client import PROJECT_ID, TAG_NAME, client

	print("Starting...")

	# do the learning: start a run for the configured tag and show the reply
	response = client.post(
	    f"projects/{PROJECT_ID}/sidekick/learn/", data={"tagname": TAG_NAME}
	)
	print(response.status_code)
	print(response.json())

	# check the status
	response = client.get(f"projects/{PROJECT_ID}/sidekick/")
	status = response.json()["status"]
	print(status)

	# keep checking the status for as long as it is "pending"
	while status == "pending":
	    # wait ten seconds before checking again
	    time.sleep(10)
	    response = client.get(f"projects/{PROJECT_ID}/sidekick/")
	    status = response.json()["status"]
	    print(status)

	print("Done!")
	"""
	Run this script fourth

	This will return some statistics on your documents. If you have pre-labelled
	documents, you can get a sense of how well the scoring is doing. If it is not
	good enough, you can try marking more of your documents as being in or out of the set,
	then re-running the metric lego learning (learn.py).

	If your documents are not pre-labelled, you could modify this to sort by most
	interesting documents, view those, manually confirm if they are labelled correctly,
	and then re-run learn.py to continue finding the most relevant documents
	"""

	import statistics

	from client import NEWSGROUPS, PROJECT_ID, TAG_NAME, client

	print("Starting...")

	# data key under which the learning step stores each document's score
	score_key = f"{TAG_NAME}_score"

	for newsgroup in NEWSGROUPS:
	    print(f"Newsgroup {newsgroup}...")
	    # find all of the documents we uploaded from this folder
	    documents = client.documents.search(
	        "", project=PROJECT_ID, data_newsgroup=newsgroup,
	    )

	    count = len(documents)
	    # collect the scores, skipping documents that have no score recorded
	    # instead of crashing on the first one
	    scores = []
	    for document in documents:
	        try:
	            scores.append(float(document.data[score_key][0]))
	        except (KeyError, IndexError):
	            continue

	    # calculate some statistics to see how well the scorer did
	    print(f"Count: {count}")
	    if not scores:
	        # mean/median/min/max all raise on an empty sequence
	        print(f"No documents with a {score_key} value; run learn.py first")
	        continue
	    print(f"Mean: {statistics.mean(scores)}")
	    print(f"Median: {statistics.median(scores)}")
	    print(f"Min: {min(scores)}")
	    print(f"Max: {max(scores)}")
	    # the sample standard deviation needs at least two data points
	    if len(scores) > 1:
	        print(f"Std Dev: {statistics.stdev(scores)}")
	    else:
	        print("Std Dev: n/a (need at least two scores)")


	print("Done!")
	"""
	Run this script second
	It will initialize the sidekick instance and preprocess all the files in the project
	This may take some time depending on the number of documents in the project

	If you add or remove documents from your project, you should re-run this script
	to re-analyze your documents
	"""
	import time

	from client import PROJECT_ID, client

	print("Starting...")

	# create the sidekick on the project
	response = client.post(f"projects/{PROJECT_ID}/sidekick/")

	# check the status
	response = client.get(f"projects/{PROJECT_ID}/sidekick/")
	status = response.json()["status"]
	print(status)

	# keep checking the status for as long as it is "pending"
	while status == "pending":
	    # wait one minute before checking again
	    time.sleep(60)
	    response = client.get(f"projects/{PROJECT_ID}/sidekick/")
	    status = response.json()["status"]
	    print(status)

	print("Done!")
	"""
	Run this script first
	It will upload all of your documents into the project
	"""

	from client import NEWSGROUPS, PROJECT_ID, client

	print("Starting...")

	print("Uploading...")

	for newsgroup in NEWSGROUPS:
	    print(f"Newsgroup {newsgroup}...")
	    client.documents.upload_directory(
	        newsgroup,
	        # the files have no extension, so pass None
	        # (the original note says this would otherwise default to pdf)
	        extension=None,
	        # we set a data value so we know which folder each document came from
	        # if your documents are not already categorized you may skip this
	        data={"newsgroup": newsgroup},
	        project=PROJECT_ID,
	        # the files are plain text, so the original extension is set to txt
	        # even though the files are not named with a txt extension
	        original_extension="txt",
	        handle_errors=True,
	    )

	print("Done!")