Skip to content

Instantly share code, notes, and snippets.

View computingfreak's full-sized avatar

Milind Shah computingfreak

View GitHub Profile
# NOTE(review): truncated gist preview — the `try:` below has no body/except
# and the scrape stripped all indentation, so this snippet is not runnable
# as-is.  Visible intent: fetch a page with urllib2 (Python 2 only) and find
# anchor tags by naive string splitting on "</a>".
# NOTE(review): the loop variable `iter` shadows the `iter` builtin.
import urllib2
url="http://www.rakuten.co.jp"
page=urllib2.urlopen(url)
data=page.read().split("</a>")
tag="<a href=\""
endtag="\">"
for iter, item in enumerate(data):
if "<a href" in item:
try:
ind = item.index(tag)
# Fetch the Rakuten homepage and print the href of every anchor tag found.
# Python 2 script: urllib2 does not exist on Python 3 (use urllib.request
# there).
# NOTE(review): the scrape had stripped the loop-body indentation (a
# SyntaxError); the structure is restored here.  The regex is compiled once
# before use.  Regex-parsing HTML is fragile — fine for a quick scrape, but
# prefer a real HTML parser for anything serious.
import urllib2
import re

url = "https://www.rakuten.co.jp"
page = urllib2.urlopen(url)
html = page.read()

# Non-greedy capture of (href, link text) pairs, one match per anchor.
link_pattern = re.compile(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>")
links = link_pattern.findall(html)
for i, link in enumerate(links):
    print('%d href: %s' % (i, link[0]))
# -*- coding: utf-8 -*-
# Tokenize the Iroha poem with polyglot (third-party NLP library) and print
# each word.
# NOTE(review): the scrape had stripped the loop-body indentation (a
# SyntaxError); the loop structure is restored here, behavior otherwise
# unchanged.
import polyglot
from polyglot.text import Text, Word

# Adjacent string literals concatenate into one continuous text.
zen = Text("色は匂へど 散りぬるを"
           "我が世誰ぞ 常ならむ"
           "有為の奥山 今日越えて"
           "浅き夢見じ 酔ひもせず")
print(zen.words)
for temp in zen.words:
    # encode() for Python 2 byte-oriented stdout; on Python 3 print(temp)
    # alone would suffice.
    print(temp.encode("utf-8"))
# -*- coding: utf-8 -*-
# Iroha-poem demo with polyglot language detection (third-party NLP library).
# NOTE(review): truncated gist preview — the final `for` loop has no body,
# so this snippet is not runnable as-is.  Indentation was also stripped by
# the scrape.
import polyglot
from polyglot.text import Text, Word
zen = Text("色は匂へど 散りぬるを"
"我が世誰ぞ 常ならむ"
"有為の奥山 今日越えて"
"浅き夢見じ 酔ひもせず")
print("Language Detected: Code={}, Name={}\n".format(zen.language.code, zen.language.name))
print(zen.words)
for temp in zen.words:
# -*- coding: utf-8 -*-
# Segment a Japanese sentence into tokens with TinySegmenter (third-party,
# pure-Python compact tokenizer).
# NOTE(review): likely a truncated gist preview — `tokens` is computed but
# never printed or used in the visible code.
from __future__ import print_function
import tinysegmenter
segmenter = tinysegmenter.TinySegmenter()
# Commented-out interactive variant (Python 2: raw_input returns bytes,
# hence the decode):
#name = raw_input('Enter a Japanese sentence:\n')
#print(name)
#msg = name.decode("utf-8")
msg = u"日本語の自然言語処理は本当にしんどい。"
print(msg)
tokens = segmenter.tokenize(msg)
# Setup notes for MeCab (Japanese morphological analyzer) on macOS:
#brew install mecab
#brew install mecab-ipadic
#pip install mecab-python3
# NOTE(review): truncated gist preview — the inner `for` loop has no body
# and indentation was stripped by the scrape, so this is not runnable as-is.
from natto import MeCab
def tokenize(text):
# '-F%f[0],%f[6]' formats each node as "<part-of-speech>,<lemma>".
tokens = []
with MeCab('-F%f[0],%f[6]') as nm:
for n in nm.parse(text, as_nodes=True):
# Read test.txt, run MeCab wakati-gaki (space-separated tokenization) on it,
# and write the result to temp.txt.
# Python 2 era script: read() returns bytes, hence the decode/encode
# round-trip around parse().
# NOTE(review): `with` blocks replace the manual open()/close() pairs so
# both files are closed even if parse() raises; behavior is otherwise
# unchanged.
import MeCab
import sys

m = MeCab.Tagger("-Owakati")

with open('test.txt', 'r') as f:
    text = f.read().decode('utf-8')

with open('temp.txt', 'w') as g:
    g.write(m.parse(text.encode('utf-8')))
# -*- coding: utf-8 -*-
# NOTE(review): truncated gist preview — tokenize() ends mid-loop (only a
# comment remains as the loop body) and indentation was stripped by the
# scrape, so this snippet is not runnable as-is.
from __future__ import print_function
from natto import MeCab
def tokenize(text):
# '-F%f[0],%f[6]' formats each node as "<part-of-speech>,<lemma>".
tokens = []
with MeCab('-F%f[0],%f[6]') as nm:
for n in nm.parse(text, as_nodes=True):
# ignore any end-of-sentence nodes
# -*- coding: utf-8 -*-
# NOTE(review): truncated gist preview — the `with MeCab()` block has no
# body, so testeng() is not runnable as-is.
from __future__ import print_function
from natto import MeCab
def testeng():
words = []
# Alternative MeCab output formats, kept commented out for reference
# (-Ochasen = ChaSen table format; -Owakati = word-splitting output):
#Mecab.Tagger ("-Ochasen")##茶筌
#Mecab.Tagger ("-Owakati")##分かち書き
# Mixed Japanese/English sample sentence.
text="日本語の自然言語処理は本当にしんどい。facebookとtwitterで働いています"
with MeCab() as nm:
# -*- coding: utf-8 -*-
# NOTE(review): truncated gist preview — tokenize() has no executable body
# beyond the `words` initializer; not runnable as-is.
from __future__ import print_function
from natto import MeCab
def tokenize(text):
words = []
# Alternative MeCab output formats, kept commented out for reference
# (-Ochasen = ChaSen table format; -Owakati = word-splitting output):
#Mecab.Tagger ("-Ochasen")##茶筌
#Mecab.Tagger ("-Owakati")##分かち書き
#with MeCab('-F%f[0],%f[6]') as nm: