Skip to content

Instantly share code, notes, and snippets.

View tomonari-masada's full-sized avatar

Tomonari MASADA tomonari-masada

View GitHub Profile
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from pyknp import Juman
import torch
from pytorch_transformers import *
# Load the BERT configuration from the JSON file in the
# Japanese_L-12_H-768_A-12_E-30_BPE directory.
config = BertConfig.from_json_file('Japanese_L-12_H-768_A-12_E-30_BPE/bert_config.json')
# Build the masked-LM model from the pytorch_model.bin checkpoint in the same
# directory, passing the configuration loaded above.
model = BertForMaskedLM.from_pretrained('Japanese_L-12_H-768_A-12_E-30_BPE/pytorch_model.bin',
config=config)
# Tokenizer built from the directory's vocab.txt, with lower-casing and basic
# tokenization both switched off.
# NOTE(review): presumably the input is word-segmented beforehand (Juman is
# imported in this script) -- confirm against the code that calls the tokenizer.
tokenizer = BertTokenizer('Japanese_L-12_H-768_A-12_E-30_BPE/vocab.txt',
do_lower_case=False, do_basic_tokenize=False)
@tomonari-masada
tomonari-masada / packed_sequences.ipynb
Created July 26, 2018 14:30
packed_sequences.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@tomonari-masada
tomonari-masada / dblp_parse.py
Created July 25, 2018 08:09
A Python parser for dblp.xml
# -*- coding: utf-8 -*-
from lxml import etree
import os
import sys
from io import TextIOWrapper
from nltk.tokenize import RegexpTokenizer
#
# USAGE:
#
@tomonari-masada
tomonari-masada / avb_gmm.py
Created November 27, 2017 11:48
Adversarial variational Bayes for univariate Gaussian mixture models
import sys
import torch
import torch.nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
@tomonari-masada
tomonari-masada / spiral.py
Created August 28, 2017 12:28
spiral data classification
import io, sys, math, random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn
from torch.autograd import Variable
from torch import optim
# Replace sys.stdout with a text wrapper around its underlying byte buffer so
# that everything printed afterwards is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
# Seed torch's and NumPy's random number generators with fixed values.
torch.manual_seed(102)
np.random.seed(22)
# Create a new Matplotlib figure and keep a handle to it in `fig`.
fig = plt.figure()
@tomonari-masada
tomonari-masada / use_mecab.py
Last active April 11, 2017 03:21
How to use MeCab in Python3
import sys
import io
import MeCab
# Replace sys.stdout with a text wrapper around its byte buffer so printed
# text is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
# MeCab tagger constructed with no arguments.
m = MeCab.Tagger()
# Input text file, decoded as UTF-8.
# NOTE(review): opened without a `with` block and no close() is visible in
# this excerpt -- confirm the handle is closed later in the script.
fp = open('a1.txt', encoding='utf8')
# Starts out empty.
# NOTE(review): presumably collects the words produced by the parsing loop
# that follows -- confirm.
wseq = list()
for line in fp:
for ol in m.parse(line.strip()).split('\n'):
if len(ol.split()) > 1:
@tomonari-masada
tomonari-masada / glow500.py
Created April 3, 2017 07:50
Reproduce Table 2.2 of Applied Logistic Regression (3rd Edition) with Statsmodels
import pandas as pd
import statsmodels.api as sm
# glow500.xls at https://www.umass.edu/statdata/statdata/data/glow/index.html
# Open the workbook and parse a sheet (no sheet is named, so the parser's
# default applies) into a DataFrame, using row 0 as the header row.
xls_file = pd.ExcelFile('glow500.xls')
df = xls_file.parse(header=0)
# Build one indicator column per distinct value found in the RATERISK column.
rate_dummies = pd.get_dummies(df['RATERISK'])
# Give the indicator columns explicit names.
# NOTE(review): assigning exactly three names assumes RATERISK has exactly
# three distinct values in the data -- confirm against glow500.xls.
rate_dummies.columns = ['RATERISK1', 'RATERISK2', 'RATERISK3']
@tomonari-masada
tomonari-masada / maximal_substrings.c
Created February 28, 2017 08:47
extracting maximal substrings from UTF8 Japanese strings
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#define SEPCHAR '_'
#define MAXLEN 32
#define BUFFSIZE 1000000
#define TOKENLEN 8