Skip to content

Instantly share code, notes, and snippets.

@Hironsan
Created September 11, 2018 20:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Hironsan/f2bab8fabc7dd5967bee1c9648695120 to your computer and use it in GitHub Desktop.
chABSA-dataset corpus reader
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
from pathlib import Path
def find_corpus_fileids(root, regexp):
    """Return the paths under *root* that match a glob pattern, sorted.

    Args:
        root: Directory to search.
        regexp: Glob pattern (not a regular expression, despite the
            parameter name) handed to ``Path.glob``, e.g. ``'*.json'``.

    Returns:
        A list of ``pathlib.Path`` objects in ascending order; empty when
        nothing matches.
    """
    # Globbing reports entries in whatever order the file system lists them,
    # so sort to give callers a reproducible document order.
    return sorted(Path(root).glob(regexp))
class ChabsaCorpusReader(object):
    """Reader for a directory of chABSA-dataset JSON documents.

    Every file matched by the glob pattern is parsed eagerly when the reader
    is constructed. The code reads ``header.category33`` from each document
    and a ``sentences`` list whose items carry ``sentence`` (the text) and
    ``opinions`` (entries with ``category``, ``polarity``, ``from``, ``to``).
    """

    def __init__(self, root, regexp, encoding='utf-8'):
        """Locate the corpus files and load them into memory.

        Args:
            root: Directory that contains the corpus files.
            regexp: Glob pattern (despite the name) forwarded to
                ``find_corpus_fileids``, e.g. ``'*.json'``.
            encoding: Text encoding used when opening each file.
        """
        # Sorted so the document order does not depend on the file system.
        self._fileids = sorted(find_corpus_fileids(root, regexp))
        self._encoding = encoding
        self._docs = []
        self._f2c = defaultdict(set)  # file id -> categories found in it
        self._c2f = defaultdict(set)  # category -> file ids that carry it
        self._load()

    @staticmethod
    def _as_list(value):
        """Wrap a lone string/path in a list; copy any other iterable once."""
        if isinstance(value, (str, Path)):
            return [value]
        return list(value)

    def _load(self):
        """Parse every corpus file and index it by its category."""
        for fileid in self._fileids:
            with open(fileid, encoding=self._encoding) as f:
                doc = json.load(f)
            category = doc['header']['category33']
            self._f2c[fileid].add(category)
            self._c2f[category].add(fileid)
            self._docs.append(doc)

    def _iter_docs(self, categories=None):
        """Yield loaded documents, optionally restricted to *categories*."""
        if categories is None:
            yield from self._docs
            return
        # A bare string is treated as one category rather than being
        # substring-matched against each document's category.
        wanted = frozenset(self._as_list(categories))
        for doc in self._docs:
            if doc['header']['category33'] in wanted:
                yield doc

    def sents(self, categories=None):
        """Yield the raw text of each sentence.

        Args:
            categories: ``None`` for the whole corpus, otherwise a category
                name or an iterable of category names to keep.

        Yields:
            The ``sentence`` string of every sentence in the selected
            documents, in load order.
        """
        for doc in self._iter_docs(categories):
            for sent in doc['sentences']:
                yield sent['sentence']

    def tagged_sents(self, categories=None):
        """Yield each sentence as per-character tagged triples.

        Args:
            categories: ``None`` for the whole corpus, otherwise a category
                name or an iterable of category names to keep.

        Yields:
            A tuple with one ``(character, aspect_tag, polarity_tag)`` triple
            per character of the sentence text.
        """
        for doc in self._iter_docs(categories):
            for sent in doc['sentences']:
                text = sent['sentence']
                aspects, polarities = self._get_tags(text, sent['opinions'])
                yield tuple(zip(text, aspects, polarities))

    def _get_tags(self, text, opinions):
        """Build per-character BIO tags for aspects and polarities.

        Positions not covered by any opinion keep the tag ``'O'``. The first
        position of an opinion span (``from``) gets a ``B-`` tag and the
        remaining positions up to, but excluding, ``to`` get ``I-`` tags.
        When spans overlap, the opinion listed later overwrites earlier tags.

        Args:
            text: The sentence text; only its length is used.
            opinions: Iterable of dicts with ``category``, ``polarity``,
                ``from`` and ``to`` keys.

        Returns:
            A pair ``(aspects, polarities)`` of lists, each ``len(text)`` long.
        """
        aspects = ['O'] * len(text)
        polarities = ['O'] * len(text)
        for opinion in opinions:
            start = opinion['from']
            for i in range(start, opinion['to']):
                template = 'B-{}' if i == start else 'I-{}'
                aspects[i] = template.format(opinion['category'])
                polarities[i] = template.format(opinion['polarity'])
        return aspects, polarities

    def categories(self, fileids=None):
        """Return the sorted category names in the corpus.

        Args:
            fileids: ``None`` for every category, otherwise a file id or an
                iterable of file ids (``str`` or ``Path``) to restrict to.
                Unknown ids contribute nothing.

        Returns:
            A sorted list of category names; empty when nothing matches.
        """
        if fileids is None:
            return sorted(self._c2f)
        found = set()
        for fileid in self._as_list(fileids):
            # .get() keeps the defaultdict from inserting an empty entry for
            # an id that was never loaded.
            found.update(self._f2c.get(Path(fileid), ()))
        return sorted(found)

    def fileids(self, categories=None):
        """Return the file ids of the corpus.

        Args:
            categories: ``None`` for every file, otherwise a category name or
                an iterable of category names to restrict to. Unknown
                categories contribute nothing.

        Returns:
            A new sorted list of file ids; empty when nothing matches.
        """
        if categories is None:
            # Hand out a copy so callers cannot mutate the reader's state.
            return list(self._fileids)
        found = set()
        for category in self._as_list(categories):
            # .get() avoids registering the unknown category in the index,
            # which would otherwise leak into later categories() results.
            found.update(self._c2f.get(category, ()))
        return sorted(found)
if __name__ == '__main__':
    # Demo run against a local checkout of the chABSA dataset: print the
    # sentences, one tagged sentence, and the category / file-id indexes.
    chabsa = ChabsaCorpusReader('../data/chABSA-dataset/', '*.json')
    print(list(chabsa.sents()))
    print(list(chabsa.sents(categories=['サービス業'])))
    # Take only the first tagged sentence instead of building the full list;
    # prints None rather than raising IndexError when the corpus is empty.
    print(next(iter(chabsa.tagged_sents()), None))
    print(chabsa.categories())
    print(chabsa.fileids(["サービス業"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment