Skip to content

Instantly share code, notes, and snippets.

@mitchelljkotler
Created August 5, 2021 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mitchelljkotler/903711854ada0002163fa2dd82c13b11 to your computer and use it in GitHub Desktop.
Save mitchelljkotler/903711854ada0002163fa2dd82c13b11 to your computer and use it in GitHub Desktop.
Sidekick test scripts
"""
This defines the DocumentCloud client to be imported into the other scripts
Make sure you have python-documentcloud >= 2.2.0
These scripts use the 20 newsgroup data, found here:
http://qwone.com/~jason/20Newsgroups/
You may download that if you would like to test, or feel free to modify these scripts
to fit your own data
"""
import logging
import os
from documentcloud import DocumentCloud
# DocumentCloud account used by every script: set your username here
USERNAME = "mitch"
# the password is read from the DC_PASSWORD environment variable so that it
# never has to be saved to a file (it is None when the variable is unset)
PASSWORD = os.getenv("DC_PASSWORD")

# ID of the project the scripts operate on
PROJECT_ID = 204306
# name of the data key used to mark documents as in the set or not;
# use the values "true" and "false"
TAG_NAME = "politics"
# folders which contain your documents
NEWSGROUPS = ["talk.politics.misc", "rec.autos"]

# shared client instance imported by the other scripts
client = DocumentCloud(
    username=USERNAME,
    password=PASSWORD,
    # INFO enables verbose logging; feel free to remove this or change
    # the level to suit your needs
    loglevel=logging.INFO,
    timeout=30,
)
"""
Run this script third
Be sure to mark some documents as being in or out of the interested set
by setting the TAG_NAME data to true or false on some of the documents
This can be done via the web interface or via the API (example not provided)
This script will run the metric lego learning, which will assign a score to each
document under the data {TAG_NAME}_score
"""
import time
from client import PROJECT_ID, TAG_NAME, client
def poll_status(fetch_status, interval, sleep=time.sleep):
    """Print the status repeatedly until it is no longer "pending".

    Args:
        fetch_status: zero-argument callable returning the current status.
        interval: seconds to wait between two checks.
        sleep: callable used to wait; defaults to ``time.sleep``.

    Returns:
        The first status that is not "pending". Note that this is not
        necessarily a success: the loop stops on any other value.
    """
    status = fetch_status()
    print(status)
    while status == "pending":
        sleep(interval)
        status = fetch_status()
        print(status)
    return status


def main():
    """Start a learning run for TAG_NAME and wait until it stops pending."""
    print("Starting...")
    # do the learning!
    response = client.post(
        f"projects/{PROJECT_ID}/sidekick/learn/", data={"tagname": TAG_NAME}
    )
    print(response.status_code)
    print(response.json())

    def fetch_status():
        # read the current status of the project's sidekick
        return client.get(f"projects/{PROJECT_ID}/sidekick/").json()["status"]

    # check the status every ten seconds until it is no longer pending;
    # the last status printed is the outcome of the run
    poll_status(fetch_status, 10)
    print("Done!")


if __name__ == "__main__":
    main()
"""
Run this script fourth
This will return some statistics on your documents. If you have pre-labelled
documents, you can get a sense of how well the scoring is doing. If it is not
good enough, you can try marking more of your documents as being in or out of the set,
then re-running the metric lego learning (learn.py).
If your documents are not pre-labelled, you could modify this to sort by most
interesting documents, view those, manually confirm if they are labelled correctly,
and then re-run learn.py to continue finding the most relevant documents
"""
import statistics
from client import NEWSGROUPS, PROJECT_ID, TAG_NAME, client
def summarize_scores(scores):
    """Return the summary statistics for a list of scores.

    Args:
        scores: list of numeric scores; may be empty.

    Returns:
        A list of ``(label, value)`` pairs in print order. It is empty when
        there are no scores, and "Std Dev" is only included when there are at
        least two scores, because the sample standard deviation is undefined
        for fewer data points.
    """
    if not scores:
        return []
    summary = [
        ("Mean", statistics.mean(scores)),
        ("Median", statistics.median(scores)),
        ("Min", min(scores)),
        ("Max", max(scores)),
    ]
    if len(scores) > 1:
        summary.append(("Std Dev", statistics.stdev(scores)))
    return summary


def main():
    """Print score statistics for the documents of each newsgroup."""
    print("Starting...")
    score_key = f"{TAG_NAME}_score"
    for newsgroup in NEWSGROUPS:
        print(f"Newsgroup {newsgroup}...")
        # find all of the documents we uploaded from this folder
        documents = client.documents.search(
            "", project=PROJECT_ID, data_newsgroup=newsgroup,
        )
        scores = []
        # count the documents as they are iterated so the figure always
        # describes the same documents the scores were collected from
        count = 0
        # collect all of the scores for these documents
        for document in documents:
            count += 1
            try:
                scores.append(float(document.data[score_key][0]))
            except (KeyError, IndexError):
                # this document has no score yet (learn.py not run since it
                # was added); skip it instead of aborting the whole report
                continue
        # calculate some statistics to see how well the scorer did
        print(f"Count: {count}")
        if len(scores) != count:
            print(f"Scored: {len(scores)}")
        for label, value in summarize_scores(scores):
            print(f"{label}: {value}")
    print("Done!")


if __name__ == "__main__":
    main()
"""
Run this script second
It will initialize the sidekick instance and preprocess all the files in the project
This may take some time depending on the number of documents in the project
If you add or remove documents from your project, you should re-run this script
to re-analyze your documents
"""
import time
from client import PROJECT_ID, client
def poll_status(fetch_status, interval, sleep=time.sleep):
    """Print the status repeatedly until it is no longer "pending".

    Args:
        fetch_status: zero-argument callable returning the current status.
        interval: seconds to wait between two checks.
        sleep: callable used to wait; defaults to ``time.sleep``.

    Returns:
        The first status that is not "pending". Note that this is not
        necessarily a success: the loop stops on any other value.
    """
    status = fetch_status()
    print(status)
    while status == "pending":
        sleep(interval)
        status = fetch_status()
        print(status)
    return status


def main():
    """Create the sidekick on the project and wait until it stops pending."""
    print("Starting...")
    # create the sidekick on the project
    client.post(f"projects/{PROJECT_ID}/sidekick/")

    def fetch_status():
        # read the current status of the project's sidekick
        return client.get(f"projects/{PROJECT_ID}/sidekick/").json()["status"]

    # check the status once a minute until it is no longer pending;
    # the last status printed is the outcome of the preprocessing
    poll_status(fetch_status, 60)
    print("Done!")


if __name__ == "__main__":
    main()
"""
Run this script first
It will upload all of your documents into the project
"""
from client import NEWSGROUPS, PROJECT_ID, client
print("Starting...")
print("Uploading...")
# upload each folder of documents into the project, one folder at a time
for newsgroup in NEWSGROUPS:
    print(f"Newsgroup {newsgroup}...")
    client.documents.upload_directory(
        newsgroup,
        # my documents have no extension, so None is passed explicitly;
        # if this is left out it will default to pdf
        extension=None,
        # we set a data value so we know which folder each document came from
        # if your documents are not already categorized you may skip this
        data={"newsgroup": newsgroup},
        project=PROJECT_ID,
        # I have text files, so I set the original extension to txt
        # even though the files are not named with a txt extension
        original_extension="txt",
        handle_errors=True,
    )
print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment