Skip to content

Instantly share code, notes, and snippets.

@GINK03
Created April 3, 2017 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GINK03/28630674405219fe152fb4d5e92557cb to your computer and use it in GitHub Desktop.
Save GINK03/28630674405219fe152fb4d5e92557cb to your computer and use it in GitHub Desktop.
char level 絵文字を意識したfasttextによるベクトル化
# Highest input-file index make_char() will process before it stops.
DATA_SIZE = 300 * 1000
def _make_char_index():
    """Build a token-to-index table from ``./char_level.txt`` and exit.

    Reads the whitespace-separated tokens of ``./char_level.txt``, skips
    every token that starts with an emoji (per the pattern below), and gives
    each remaining distinct token the next free integer index in order of
    first appearance. Every newly indexed token is printed to stdout. The
    table is pickled to ``char_index.pkl`` in the current directory, after
    which the process is terminated with ``sys.exit()``.

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
        SystemExit: always, once the pickle has been written.
    """
    # Compiled once: the pattern is the same for every token.
    emoji = re.compile(u'['
                       u'\U0001F300-\U0001F5FF'
                       u'\U0001F600-\U0001F64F'
                       u'\U0001F680-\U0001F6FF'
                       u'\u2600-\u26FF\u2700-\u27BF]+',
                       re.UNICODE)

    # Context managers guarantee the handles are closed (and the pickle is
    # fully flushed) before sys.exit() is reached.
    with open('./char_level.txt', 'r') as source:
        tokens = source.read().replace('\n', ' ').split()

    char_index = {}
    for char in tokens:
        if emoji.match(char) is not None:
            continue
        if char in char_index:
            continue
        char_index[char] = len(char_index)
        print(char)

    with open('char_index.pkl', 'wb') as sink:
        sink.write(pickle.dumps(char_index))
    sys.exit()
def make_char():
    """Dump emoji-bearing texts as character sequences and train fastText.

    Recreates the ``./dataset`` directory, then walks the files matched by
    ``../out20170325/*``. Each file is parsed as JSON and its ``'txt'`` value
    is read; files that cannot be read or parsed, and texts containing no
    emoji, are skipped. For every remaining text one line is written to
    ``char_level.txt``: the emoji runs found in the text, followed by the
    individual characters of the text with spaces, newlines and emoji
    removed, all joined by single spaces. Finally ``./fasttext skipgram`` is
    run on that file.

    Progress is reported on stderr every 10000 files, and the walk stops
    once the file index exceeds ``DATA_SIZE``.

    Raises:
        KeyError: if a parsed JSON object has no ``'txt'`` key.
        OSError: if ``char_level.txt`` cannot be opened for writing.
    """
    os.system('rm -rf ./dataset')
    os.system('mkdir dataset')

    # Compiled once: the pattern is the same for every input file.
    emoji = re.compile(u'['
                       u'\U0001F300-\U0001F5FF'
                       u'\U0001F600-\U0001F64F'
                       u'\U0001F680-\U0001F6FF'
                       u'\u2600-\u26FF\u2700-\u27BF]+',
                       re.UNICODE)
    # NOTE(review): the plain-space pattern is listed twice; the second entry
    # may originally have been a different whitespace character (e.g. a
    # full-width space) -- confirm against the original gist.
    strip_patterns = [r' ', r'\n', r' ', emoji]

    # The output must be closed (hence flushed) before fastText reads it.
    with open('char_level.txt', 'w') as f:
        for ni, name in enumerate(glob.glob('../out20170325/*')):
            if ni % 10000 == 0:
                print("iter %d" % ni, file=sys.stderr)
            if ni > DATA_SIZE:
                break
            try:
                with open(name) as source:
                    obj = json.loads(source.read())
            except (OSError, ValueError):
                # Unreadable, undecodable or malformed JSON input: skip it.
                continue
            text = obj['txt']
            emojis = emoji.findall(text)
            if not emojis:
                continue
            # Apply the substitutions cumulatively; substituting on `text`
            # each time would keep only the effect of the last pattern.
            no_emoji = text
            for p in strip_patterns:
                no_emoji = re.sub(p, '', no_emoji)
            emojis.extend(no_emoji)
            f.write("%s\n" % ' '.join(emojis))

    # ./fasttext skipgram -input char_level.txt -output model -dim 256 -minCount 1
    os.system("./fasttext skipgram -input char_level.txt -output model -dim 256 -minCount 1")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment