Skip to content

Instantly share code, notes, and snippets.

Created January 1, 2016 17:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/3f64be27604c1aa00fc2 to your computer and use it in GitHub Desktop.
Save anonymous/3f64be27604c1aa00fc2 to your computer and use it in GitHub Desktop.
from revscoring.datasources import revision
from revscoring.datasources import diff
from revscoring.extractors import APIExtractor
import mwapi
# File to dump data into. It is opened here and stays open because the
# extraction step later in the script writes to `dump_target`.
# UTF-8 is requested explicitly: the dumped text comes from Wikipedia, and the
# platform's default encoding may not be able to represent it.
DUMP_TARGET_NAME = "dump.txt"
dump_target = open(DUMP_TARGET_NAME, 'w', encoding="utf-8")

# Select rev_ids to examine: the first tab-separated column of each line of
# IDS_FILE_NAME, parsed as an int, for at most MAX_REVISIONS lines.
IDS_FILE_NAME = "data.tsv"
MAX_REVISIONS = 10
rev_ids = []
with open(IDS_FILE_NAME) as ids_file:
    for line in ids_file:
        if len(rev_ids) >= MAX_REVISIONS:
            break
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a trailing newline at end of file) carries no
            # id; int("") would raise ValueError, so skip it.
            continue
        rev_ids.append(int(stripped.split("\t")[0]))

# Kept for backward compatibility: number of revision ids collected.
count = len(rev_ids)
# Feature to examine. Let FEATURE be one of
#   diff.added_tokens
#   diff.removed_tokens
#   diff.added_segments
#   diff.removed_segments
#   revision.content
#   revision.content_tokens
FEATURE = revision.content

# Extract data from selected revisions and write to selected file.
# The try/finally guarantees the dump file is flushed and closed even when the
# extraction fails or the script bails out on an unexpected value type.
try:
    extr = APIExtractor(mwapi.Session("https://en.wikipedia.org"))
    for rev_id in rev_ids:
        data = extr.extract(rev_id, FEATURE)
        dump_target.write(
            "\n\nBeginning %s of revision %d\n\n" % (FEATURE, rev_id))
        if isinstance(data, str):
            dump_target.write(data)
        elif isinstance(data, list):
            # NOTE(review): writelines() needs every element to be a string
            # and adds no separators -- confirm this holds for the token and
            # segment features listed above.
            dump_target.writelines(data)
        else:
            # Only str and list values can be dumped; anything else aborts the
            # run. SystemExit is raised directly (rather than calling the
            # interactive-only `exit()` helper) with a non-zero status so the
            # failure is visible to the caller.
            print("Unknown Type")
            raise SystemExit(1)
        dump_target.write("\n\nEnd %s of revision %d" % (FEATURE, rev_id))
finally:
    dump_target.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment