Created: January 9, 2020 16:07
Save halfak/cab222498fa18eeab0355eac8e201dad to your computer and use it in GitHub Desktop.
Define features for: number_of_female_pronouns, number_of_male_pronouns, prop_of_female_pronouns, total_pronouns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> # This revision references https://en.wikipedia.org/wiki/Ann_Bishop_(biologist)
>>> rev_id = 931384270
>>> from revscoring.extractors import api
>>> from revscoring.features import wikitext
>>> import mwapi
>>> extractor = api.Extractor(mwapi.Session("https://en.wikipedia.org"))
>>> # We define some datasources for extracting pronouns
>>> male_pronouns = wikitext.revision.datasources.tokens_matching(r"\b(he|him|his)\b")
>>> female_pronouns = wikitext.revision.datasources.tokens_matching(r"\b(she|her|hers)\b")
>>> # We can check if we did things right by extracting them
>>> extractor.extract(rev_id, male_pronouns)
[Token('his', type='word')]
>>> extractor.extract(rev_id, female_pronouns)
[Token('She', type='word'), Token('her', type='word'), Token('Her', type='word'), Token('her', type='word'), Token('her', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('her', type='word'), Token('Her', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('she', type='word'), Token('She', type='word'), Token('her', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('she', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('Her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('she', type='word'), Token('she', type='word'), Token('her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('She', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('she', type='word'), Token('her', type='word'), Token('her', type='word'), Token('She', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('she', type='word'), Token('she', type='word'), Token('her', type='word'), Token('She', type='word'), Token('she', type='word'), Token('her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('Her', type='word'), Token('her', type='word'), 
Token('she', type='word'), Token('She', type='word'), Token('she', type='word'), Token('her', type='word'), Token('Her', type='word'), Token('her', type='word'), Token('She', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('her', type='word'), Token('Her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('Her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('she', type='word'), Token('her', type='word'), Token('her', type='word'), Token('her', type='word'), Token('she', type='word'), Token('She', type='word'), Token('She', type='word'), Token('She', type='word'), Token('her', type='word'), Token('she', type='word'), Token('Her', type='word'), Token('her', type='word')] | |
>>> # Ultimately, we want features -- not datastructures. So we want to wrap this in an aggregator
>>> from revscoring.features.meta import aggregators
>>> male_pronouns_count = aggregators.len(male_pronouns)
>>> extractor.extract(rev_id, male_pronouns_count)
1.0
>>> female_pronouns_count = aggregators.len(female_pronouns)
>>> extractor.extract(rev_id, female_pronouns_count)
104.0
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.