-
-
Save yfe404/4378f5c0e5be902d93be522ddc247e6b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from revscoring.features import modifiers
from revscoring.features import wikitext
from revscoring.languages import english
# Character-level features of the revision's wikitext: the total character
# count followed by per-category character counts.
char_based = [
    wikitext.revision.chars,
    wikitext.revision.whitespace_chars,
    wikitext.revision.markup_chars,
    wikitext.revision.cjk_chars,
    wikitext.revision.entity_chars,
    wikitext.revision.url_chars,
    wikitext.revision.word_chars,
    wikitext.revision.uppercase_word_chars,
    wikitext.revision.punctuation_chars,
    wikitext.revision.break_chars,
    wikitext.revision.longest_repeated_char,
]
# Token-level features of the revision's wikitext: the total token count,
# per-category token counts, and the longest token / longest word lengths.
token_based = [
    wikitext.revision.tokens,
    wikitext.revision.numbers,
    wikitext.revision.whitespaces,
    wikitext.revision.markups,
    wikitext.revision.cjks,
    wikitext.revision.entities,
    wikitext.revision.urls,
    wikitext.revision.words,
    wikitext.revision.uppercase_words,
    wikitext.revision.punctuations,
    wikitext.revision.breaks,
    wikitext.revision.longest_token,
    wikitext.revision.longest_word,
]
# English-specific word-list features: badword matches, stopwords,
# non-stopwords and informal-word matches found in the revision.
lang_based = [
    english.badwords.revision.matches,
    english.stopwords.revision.stopwords,
    english.stopwords.revision.non_stopwords,
    english.informals.revision.matches,
]
# Features taken from the parsed wikitext structure rather than from the raw
# character/token stream.
parse_based = [
    wikitext.revision.content_chars,
    wikitext.revision.headings,
    wikitext.revision.external_links,
    wikitext.revision.wikilinks,
    wikitext.revision.tags,
    wikitext.revision.ref_tags,
    wikitext.revision.templates,
]
# Character-count features expressed as a proportion of the total number of
# characters in the revision.
#
# The denominator is clamped to at least 1: an empty revision has 0 chars and
# an unguarded division would raise ZeroDivisionError during extraction.
# For any non-empty revision the resulting values are unchanged.
_safe_chars = modifiers.max(wikitext.revision.chars, 1)
char_normalized = [
    wikitext.revision.whitespace_chars / _safe_chars,
    wikitext.revision.markup_chars / _safe_chars,
    wikitext.revision.cjk_chars / _safe_chars,
    wikitext.revision.entity_chars / _safe_chars,
    wikitext.revision.url_chars / _safe_chars,
    wikitext.revision.word_chars / _safe_chars,
    wikitext.revision.uppercase_word_chars / _safe_chars,
    wikitext.revision.punctuation_chars / _safe_chars,
    wikitext.revision.break_chars / _safe_chars,
    wikitext.revision.longest_repeated_char / _safe_chars,
]
# Token-count features expressed as a proportion of the total number of
# tokens in the revision.
#
# The denominator is clamped to at least 1: a revision with no tokens would
# otherwise raise ZeroDivisionError during extraction. For any revision with
# at least one token the resulting values are unchanged.
_safe_tokens = modifiers.max(wikitext.revision.tokens, 1)
token_normalized = [
    wikitext.revision.numbers / _safe_tokens,
    wikitext.revision.whitespaces / _safe_tokens,
    wikitext.revision.markups / _safe_tokens,
    wikitext.revision.cjks / _safe_tokens,
    wikitext.revision.entities / _safe_tokens,
    wikitext.revision.urls / _safe_tokens,
    wikitext.revision.words / _safe_tokens,
    wikitext.revision.uppercase_words / _safe_tokens,
    wikitext.revision.punctuations / _safe_tokens,
    wikitext.revision.breaks / _safe_tokens,
    wikitext.revision.longest_token / _safe_tokens,
    wikitext.revision.longest_word / _safe_tokens,
]
# English word-list features expressed as a proportion of the number of words
# in the revision.
#
# The denominator is clamped to at least 1: a revision with no words would
# otherwise raise ZeroDivisionError during extraction. For any revision with
# at least one word the resulting values are unchanged.
_safe_words = modifiers.max(wikitext.revision.words, 1)
lang_normalized = [
    english.badwords.revision.matches / _safe_words,
    english.stopwords.revision.stopwords / _safe_words,
    english.stopwords.revision.non_stopwords / _safe_words,
    english.informals.revision.matches / _safe_words,
]
# Complete feature list: raw counts first, then the normalized ratios.
# The order of the concatenation determines the order of the extracted
# feature values, so it must not be rearranged.
draft_quality = (
    char_based
    + token_based
    + parse_based
    + lang_based
    + char_normalized
    + token_normalized
    + lang_normalized
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from pprint import pprint | |
>>> | |
>>> from revscoring.datasources import revision_oriented as ro | |
>>> from revscoring.dependencies import solve | |
>>> | |
>>> | |
>>> text = """ | |
... {{Infobox|foo}} | |
... I am an article | |
... | |
... == Header! == | |
... * list | |
... * items | |
... * [[stuff|and a link]]<ref>some stuff</ref> | |
... """ | |
>>> pprint(list(zip(draft_quality, solve(draft_quality, cache={ro.revision.text: text})))) | |
[(<feature.wikitext.revision.chars>, 139), | |
(<feature.wikitext.revision.whitespace_chars>, 27), | |
(<feature.wikitext.revision.markup_chars>, 12), | |
(<feature.wikitext.revision.cjk_chars>, 0), | |
(<feature.wikitext.revision.entity_chars>, 0), | |
(<feature.wikitext.revision.url_chars>, 0), | |
(<feature.wikitext.revision.word_chars>, 59), | |
(<feature.wikitext.revision.uppercase_word_chars>, 0), | |
(<feature.wikitext.revision.punctuation_chars>, 25), | |
(<feature.wikitext.revision.break_chars>, 0), | |
(<feature.wikitext.revision.longest_repeated_char>, 3), | |
(<feature.len(<datasource.tokenized(datasource.revision.text)>)>, 64), | |
(<feature.len(<datasource.wikitext.revision.numbers>)>, 0), | |
(<feature.len(<datasource.wikitext.revision.whitespaces>)>, 27), | |
(<feature.len(<datasource.wikitext.revision.markups>)>, 6), | |
(<feature.len(<datasource.wikitext.revision.cjks>)>, 0), | |
(<feature.len(<datasource.wikitext.revision.entities>)>, 0), | |
(<feature.len(<datasource.wikitext.revision.urls>)>, 0), | |
(<feature.len(<datasource.wikitext.revision.words>)>, 15), | |
(<feature.len(<datasource.wikitext.revision.uppercase_words>)>, 0), | |
(<feature.len(<datasource.wikitext.revision.punctuations>)>, 9), | |
(<feature.len(<datasource.wikitext.revision.breaks>)>, 0), | |
(<feature.max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>)>, | |
7), | |
(<feature.max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>)>, | |
7), | |
(<feature.wikitext.revision.content_chars>, 102), | |
(<feature.wikitext.revision.headings>, 0), | |
(<feature.wikitext.revision.external_links>, 0), | |
(<feature.wikitext.revision.wikilinks>, 1), | |
(<feature.wikitext.revision.tags>, 1), | |
(<feature.wikitext.revision.ref_tags>, 1), | |
(<feature.wikitext.revision.templates>, 1), | |
(<feature.len(<datasource.english.badwords.revision.matches>)>, 0), | |
(<feature.len(<datasource.english.stopwords.revision.stopwords>)>, 6), | |
(<feature.len(<datasource.english.stopwords.revision.non_stopwords>)>, 9), | |
(<feature.len(<datasource.english.informals.revision.matches>)>, 1), | |
(<feature.(wikitext.revision.whitespace_chars / wikitext.revision.chars)>, | |
0.19424460431654678), | |
(<feature.(wikitext.revision.markup_chars / wikitext.revision.chars)>, | |
0.08633093525179857), | |
(<feature.(wikitext.revision.cjk_chars / wikitext.revision.chars)>, 0.0), | |
(<feature.(wikitext.revision.entity_chars / wikitext.revision.chars)>, 0.0), | |
(<feature.(wikitext.revision.url_chars / wikitext.revision.chars)>, 0.0), | |
(<feature.(wikitext.revision.word_chars / wikitext.revision.chars)>, | |
0.4244604316546763), | |
(<feature.(wikitext.revision.uppercase_word_chars / wikitext.revision.chars)>, | |
0.0), | |
(<feature.(wikitext.revision.punctuation_chars / wikitext.revision.chars)>, | |
0.17985611510791366), | |
(<feature.(wikitext.revision.break_chars / wikitext.revision.chars)>, 0.0), | |
(<feature.(wikitext.revision.longest_repeated_char / wikitext.revision.chars)>, | |
0.02158273381294964), | |
(<feature.(len(<datasource.wikitext.revision.numbers>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(len(<datasource.wikitext.revision.whitespaces>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.421875), | |
(<feature.(len(<datasource.wikitext.revision.markups>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.09375), | |
(<feature.(len(<datasource.wikitext.revision.cjks>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(len(<datasource.wikitext.revision.entities>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(len(<datasource.wikitext.revision.urls>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(len(<datasource.wikitext.revision.words>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.234375), | |
(<feature.(len(<datasource.wikitext.revision.uppercase_words>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(len(<datasource.wikitext.revision.punctuations>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.140625), | |
(<feature.(len(<datasource.wikitext.revision.breaks>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.0), | |
(<feature.(max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.109375), | |
(<feature.(max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
0.109375), | |
(<feature.(len(<datasource.english.badwords.revision.matches>) / len(<datasource.wikitext.revision.words>))>, | |
0.0), | |
(<feature.(len(<datasource.english.stopwords.revision.stopwords>) / len(<datasource.wikitext.revision.words>))>, | |
0.4), | |
(<feature.(len(<datasource.english.stopwords.revision.non_stopwords>) / len(<datasource.wikitext.revision.words>))>, | |
0.6), | |
(<feature.(len(<datasource.english.informals.revision.matches>) / len(<datasource.wikitext.revision.words>))>, | |
0.06666666666666667)] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment