This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static final Pattern PAT_JAPANESE_CHARACTER = Pattern | |
.compile("[\\p{IsHiragana}\\p{IsKatakana}\\p{InCJKUnifiedIdeographs}]"); | |
private static boolean isJapanese(final String token) { | |
return PAT_JAPANESE_CHARACTER.matcher(token).find(); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Lexical Density | |
http://web.archive.org/web/20110810174351/http://www.unisanet.unisa.edu.au/Resources/la/Readability/Content%20words%20and%20lexical%20density.htm | |
""" | |
from __future__ import division | |
import MeCab | |
# Part-of-speech labels treated as content words, in order:
# noun, verb, adjective, adverb.
# NOTE(review): presumably compared against MeCab output -- the use site is
# not visible in this excerpt.
CONTENT_WORD_POS = ('名詞', '動詞', '形容詞', '副詞')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"401(K)s": ["Finance", "Investing", "Retirement Investments", "401(K)s"], | |
"Accommodations": ["Travel & Tourism", "Accommodations"], | |
"Accounting & Auditing": ["Finance", "Accounting & Auditing"], | |
"Acne": ["Health", "Health Conditions & Concerns", "Skin Conditions & Skin Health", "Acne"], | |
"Air Travel": ["Travel & Tourism", "Air Travel"], | |
"Airline Tickets, Fares & Flights": ["Travel & Tourism", "Air Travel", "Airline Tickets, Fares & Flights"], | |
"Alternative & Natural Medicine": ["Health", "Health Care Services", "Alternative & Natural Medicine"], | |
"Anti-Aging": ["Beauty & Personal Care", "Anti-Aging"], | |
"Anti-Virus Software": ["Computers", "Software", "Internet Software & Web Goodies", "Network Security Software", "Anti-Virus Software"], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import re | |
from encodings.aliases import aliases | |
import nkf | |
import tornado | |
from tornado import httpclient, gen | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Kanji numeral for each non-zero Arabic digit character ('0' has no entry).
CHINESE_MAP = {
    '1': '一',
    '2': '二',
    '3': '三',
    '4': '四',
    '5': '五',
    '6': '六',
    '7': '七',
    '8': '八',
    '9': '九',
}

# Kanji place-value units in ascending order; the entry at index i stands for
# 10 ** (i + 1): ten, hundred, thousand, then 10^4, 10^8 and 10^12 each
# followed by their ten/hundred/thousand multiples.
CHINESE_DIGITS = (
    '十', '百', '千',
    '万', '十万', '百万', '千万',
    '億', '十億', '百億', '千億',
    '兆', '十兆', '百兆', '千兆',
)
def arabic2chinese(arabic): | |
chinese = [] | |
if len(arabic) == '0': | |
return '〇' | |
arabic = arabic.replace(',', '') | |
for (i, num) in enumerate(arabic[::-1]): | |
if num == '0': |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
MAX_DIGIT = 10  # maximum number of digits (10 digits for an int)
# Digit characters indexed by their numeric value: NUMS[n] is the character for n.
NUMS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
def itoa(num): | |
result = '' | |
previous = 0 | |
for i in range(MAX_DIGIT,-1,-1): | |
temp = num / (10**i) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import urllib, re, os | |
from BeautifulSoup import BeautifulSoup | |
urls = (\ | |
'http://search.biglobe.ne.jp/rss/ranking.xml',\ | |
'http://trackword.rssfeed.cc/index.xml',\ | |
'http://www.jtb.co.jp/ranking/keyword/rss.aspx',\ | |
'http://www.nilab.info/buzztube/buzztube.xml',\ | |
'http://ranking.goo.ne.jp/rss/keyword/keyrank_all1/index.rdf',\ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Save the starting directory on the directory stack.
pushd .
# Install blas
cd /tmp
wget http://www.netlib.org/blas/blas.tgz
tar xzf blas.tgz
# The unpacked directory name is matched by glob.
cd BLAS*
# Compile every Fortran source: optimized, 64-bit, position-independent.
gfortran -O3 -m64 -fPIC -c *.f
# Bundle the object files into a static library and build its index.
ar r libfblas.a *.o
ranlib libfblas.a
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* 1500ったーのCSVから余分な情報をカット | |
* | |
* RTを含むツイート、行頭の1500ったーのナンバーとユーザー名と投稿日、行末の引用元URL | |
* 文中のURL、返信宛先(@fooとか)をカット | |
* | |
* USAGE | |
* このスクリプトと同じディレクトリに1500ったーのCSVを置き | |
* コマンドプロンプトやターミナルから以下のとおり入力 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Check the number of parameters | |
if [ $# -ne 4 ]; then | |
echo "usage: ./cv.sh [FILE] [division number] [train command] [test command]" 1>&2 | |
echo "The file for training is named as trainfile" 1>&2 | |
echo "The file for testing is named as testfile" 1>&2 | |
echo "for example:" 1>&2 | |
echo './cv.sh data 5 "opal trainfile train.model -" "opal - train.model testfile"' 1>&2 | |
exit 1 |