-
-
Save mitchelljkotler/903711854ada0002163fa2dd82c13b11 to your computer and use it in GitHub Desktop.
Sidekick test scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This defines the DocumentCloud client to be imported into the other scripts | |
Make sure you have python-documentcloud >= 2.2.0 | |
These scripts use the 20 newsgroup data, found here: | |
http://qwone.com/~jason/20Newsgroups/ | |
You may download that if you would like to test, or feel free to modify
these scripts to fit your own data
""" | |
import logging | |
import os | |
from documentcloud import DocumentCloud | |
# --- Settings: adjust these for your own account and project ---

# your DocumentCloud username
USERNAME = "mitch"

# the password is read from the DC_PASSWORD environment variable so that it
# never has to be saved into this file
PASSWORD = os.getenv("DC_PASSWORD")

# ID of the project the scripts operate on
PROJECT_ID = 204306

# name of the data tag used to mark documents as in the set or not;
# tag documents with the values "true" and "false"
TAG_NAME = "politics"

# the folders which contain your documents
NEWSGROUPS = ["talk.politics.misc", "rec.autos"]

# --- Shared client, imported by the other scripts ---
client = DocumentCloud(
    username=USERNAME,
    password=PASSWORD,
    # INFO gives verbose logging; remove this or pick another level
    # if that is too noisy for you
    loglevel=logging.INFO,
    timeout=30,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run this script third | |
Be sure to mark some documents as being in or out of the interested set | |
by setting the TAG_NAME data to true or false on some of the documents | |
This can be done via the web interface or via the API (example not provided) | |
This script will run the metric lego learning, which will assign a score to each | |
document under the data {TAG_NAME}_score | |
""" | |
import time | |
from client import PROJECT_ID, TAG_NAME, client | |
print("Starting...")

# kick off the learning run for the configured tag
learn_response = client.post(
    f"projects/{PROJECT_ID}/sidekick/learn/", data={"tagname": TAG_NAME}
)
print(learn_response.status_code)
print(learn_response.json())

# poll the sidekick, printing every status we see, until it is
# no longer reported as pending
sidekick_url = f"projects/{PROJECT_ID}/sidekick/"
while True:
    status = client.get(sidekick_url).json()["status"]
    print(status)
    if status != "pending":
        break
    # wait ten seconds before checking again
    time.sleep(10)

print("Done!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run this script fourth | |
This will return some statistics on your documents. If you have pre-labelled | |
documents, you can get a sense of how well the scoring is doing. If it is not | |
good enough, you can try marking more of your documents as being in or out of the set, | |
then re-running the metric lego learning (learn.py). | |
If your documents are not pre-labelled, you could modify this to sort by most | |
interesting documents, view those, manually confirm if they are labelled correctly, | |
and then re-run learn.py to continue finding the most relevant documents | |
""" | |
import statistics | |
from client import NEWSGROUPS, PROJECT_ID, TAG_NAME, client | |
def format_score_summary(scores):
    """Return the printable summary lines for a list of numeric scores.

    Args:
        scores: list of floats, possibly empty.

    Returns:
        A list of strings, one per statistic, in the order
        Mean, Median, Min, Max, Std Dev.  For an empty list a single
        explanatory line is returned instead, since none of the statistics
        are defined.  For a single score the standard deviation is reported
        as "n/a", because the sample standard deviation needs at least two
        data points.
    """
    if not scores:
        return ["No scores found"]

    # statistics.stdev raises StatisticsError with fewer than two values
    std_dev = statistics.stdev(scores) if len(scores) > 1 else "n/a"
    return [
        f"Mean: {statistics.mean(scores)}",
        f"Median: {statistics.median(scores)}",
        f"Min: {min(scores)}",
        f"Max: {max(scores)}",
        f"Std Dev: {std_dev}",
    ]


def main():
    """Print score statistics for the documents of every newsgroup."""
    print("Starting...")
    score_key = f"{TAG_NAME}_score"

    for newsgroup in NEWSGROUPS:
        print(f"Newsgroup {newsgroup}...")
        # find all of the documents we uploaded from this folder
        documents = client.documents.search(
            "", project=PROJECT_ID, data_newsgroup=newsgroup,
        )

        # Count while iterating so the count always matches the documents we
        # actually looked at.
        # NOTE(review): len() on the search result may only cover a single
        # page of results - confirm against python-documentcloud.
        count = 0
        scores = []
        for document in documents:
            count += 1
            try:
                scores.append(float(document.data[score_key][0]))
            except (KeyError, IndexError):
                # this document has no score (yet); skip it rather than
                # aborting the whole report
                continue

        # calculate some statistics to see how well the scorer did
        print(f"Count: {count}")
        if len(scores) != count:
            print(f"Scored: {len(scores)}")
        for line in format_score_summary(scores):
            print(line)

    print("Done!")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run this script second | |
It will initialize the sidekick instance and preprocess all the files in the project | |
This may take some time depending on the number of documents in the project | |
If you add or remove documents from your project, you should re-run this script | |
to re-analyze your documents | |
""" | |
import time | |
from client import PROJECT_ID, client | |
print("Starting...")

sidekick_url = f"projects/{PROJECT_ID}/sidekick/"

# create the sidekick on the project
client.post(sidekick_url)

# poll the sidekick, printing every status we see, until it is
# no longer reported as pending
while True:
    status = client.get(sidekick_url).json()["status"]
    print(status)
    if status != "pending":
        break
    # wait one minute before checking again
    time.sleep(60)

print("Done!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run this script first | |
It will upload all of your documents into the project | |
""" | |
from client import NEWSGROUPS, PROJECT_ID, client | |
print("Starting...")
print("Uploading...")

for folder in NEWSGROUPS:
    print(f"Newsgroup {folder}...")
    upload_options = {
        # my documents have no extension, so none is given here
        # (per the original note, it will otherwise default to pdf)
        "extension": None,
        # record which folder each document came from; if your documents
        # are not already categorized you may skip this
        "data": {"newsgroup": folder},
        "project": PROJECT_ID,
        # the files are plain text, so the original extension is set to txt
        # even though the files are not named with a txt extension
        "original_extension": "txt",
        "handle_errors": True,
    }
    client.documents.upload_directory(folder, **upload_options)

print("Done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment