Created
September 11, 2018 20:46
-
-
Save Hironsan/f2bab8fabc7dd5967bee1c9648695120 to your computer and use it in GitHub Desktop.
chABSA-dataset corpus reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json | |
from collections import defaultdict | |
from pathlib import Path | |
def find_corpus_fileids(root, regexp):
    """Return the paths under *root* that match a glob pattern, sorted.

    Args:
        root: Directory to search (anything accepted by ``pathlib.Path``).
        regexp: Pattern handed to ``Path.glob``. Despite the parameter name
            this is a glob pattern (e.g. ``'*.json'``), not a regular
            expression.

    Returns:
        A list of ``Path`` objects in sorted order. ``Path.glob`` itself
        yields entries in no guaranteed order, so sorting keeps the result
        reproducible across runs and platforms.
    """
    return sorted(Path(root).glob(regexp))
class ChabsaCorpusReader(object):
    """Corpus reader for a directory of chABSA-style JSON documents.

    Every matched file is parsed as JSON at construction time. The only keys
    this class reads are ``header.category33`` (the document's category) and
    ``sentences``, a list whose items provide ``sentence`` and ``opinions``.
    """

    def __init__(self, root, regexp, encoding='utf-8'):
        """Locate the corpus files and load them all into memory.

        Args:
            root: Directory that contains the corpus files.
            regexp: Glob pattern (not a regular expression) selecting the
                files under *root*, e.g. ``'*.json'``.
            encoding: Text encoding used to open each file.
        """
        self._fileids = find_corpus_fileids(root, regexp)
        self._encoding = encoding
        self._docs = []
        # file id -> set of categories, and the reverse mapping.
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)
        self._load()

    def _load(self):
        """Parse every file in ``self._fileids`` and index it by category."""
        for fileid in self._fileids:
            with open(fileid, encoding=self._encoding) as f:
                j = json.load(f)
            category = j['header']['category33']
            self._f2c[fileid].add(category)
            self._c2f[category].add(fileid)
            self._docs.append(j)

    def _iter_docs(self, categories=None):
        """Yield loaded documents, optionally restricted to *categories*.

        A document is kept when *categories* is ``None`` or when its
        ``header.category33`` value is ``in`` *categories*.
        """
        for doc in self._docs:
            if categories is not None and doc['header']['category33'] not in categories:
                continue
            yield doc

    def sents(self, categories=None):
        """Yield the ``sentence`` value of every sentence in the corpus.

        Args:
            categories: Optional container of category names; when given,
                only documents whose category is in it are visited.
        """
        for doc in self._iter_docs(categories):
            for sent in doc['sentences']:
                yield sent['sentence']

    def tagged_sents(self, categories=None):
        """Yield each sentence as a tuple of ``(unit, aspect, polarity)``.

        The sentence is zipped element by element with its aspect and
        polarity tags (presumably the sentence is a ``str``, which makes the
        units single characters).

        Args:
            categories: Optional container of category names; when given,
                only documents whose category is in it are visited.
        """
        for doc in self._iter_docs(categories):
            for sent in doc['sentences']:
                text = sent['sentence']
                aspects, polarities = self._get_tags(text, sent['opinions'])
                yield tuple(zip(text, aspects, polarities))

    def _get_tags(self, text, opinions):
        """Build parallel B-/I-/O tag lists for one sentence.

        Args:
            text: The sentence; only its length is used here.
            opinions: Iterable of dicts with ``category``, ``polarity``,
                ``from`` and ``to`` keys. ``from`` is the first tagged index
                and ``to`` is exclusive.

        Returns:
            ``(aspects, polarities)``: two lists of ``len(text)`` tags. The
            first index of a span is tagged ``B-<label>``, the remaining ones
            ``I-<label>``, and everything else ``'O'``. When spans overlap,
            the opinion listed later overwrites the earlier one.
        """
        aspects = ['O'] * len(text)
        polarities = ['O'] * len(text)
        for o in opinions:
            aspect = o['category']
            polarity = o['polarity']
            for i in range(o['from'], o['to']):
                prefix = 'B-{}' if i == o['from'] else 'I-{}'
                aspects[i] = prefix.format(aspect)
                polarities[i] = prefix.format(polarity)
        return aspects, polarities

    def categories(self, fileids=None):
        """Return the sorted categories of the corpus or of given files.

        Args:
            fileids: Optional iterable of file ids. ``None`` means every
                loaded file. An empty iterable or unknown ids give ``[]``.
        """
        if fileids is None:
            return sorted(self._c2f)
        # ``.get`` avoids inserting empty entries into the defaultdict for
        # unknown ids; ``set().union`` also works when nothing is passed.
        return sorted(set().union(*[self._f2c.get(d, ()) for d in fileids]))

    def fileids(self, categories=None):
        """Return the file ids of the corpus or of given categories.

        Args:
            categories: Optional iterable of category names. ``None`` means
                all files, returned as a copy in discovery order. Otherwise
                the matching ids are returned sorted; an empty iterable or
                unknown categories give ``[]``.
        """
        if categories is None:
            # Copy so callers cannot mutate the reader's internal list.
            return list(self._fileids)
        return sorted(set().union(*[self._c2f.get(c, ()) for c in categories]))
if __name__ == '__main__':
    # Small demo: load the corpus from the relative data directory and print
    # a few views of it.
    chabsa = ChabsaCorpusReader(root='../data/chABSA-dataset/', regexp='*.json')

    # Every sentence, then only those from the service-industry category.
    all_sentences = list(chabsa.sents())
    print(all_sentences)
    service_sentences = list(chabsa.sents(categories=['サービス業']))
    print(service_sentences)

    # First sentence with its aspect/polarity tags.
    tagged = list(chabsa.tagged_sents())
    print(tagged[0])

    # Category listing and the files belonging to one category.
    print(chabsa.categories())
    print(chabsa.fileids(['サービス業']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment