Skip to content

Instantly share code, notes, and snippets.

@adibenc
Last active September 28, 2019 15:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adibenc/d2719aeec0ac2a878d8b683a8264666f to your computer and use it in GitHub Desktop.
Save adibenc/d2719aeec0ac2a878d8b683a8264666f to your computer and use it in GitHub Desktop.
Get Java text from Wiktionary
#!/usr/bin/python
# crawl
from bs4 import BeautifulSoup as bs4
# import MySQLdb
# import os
# from Connection import *
def getlitext(f):
    """Return the text of every list item in a saved category page.

    Reads the HTML file at path ``f``, takes the first
    ``<div class="mw-category-generated">`` element and collects the text of
    each ``<li>`` found under each ``<ul>`` inside that div.

    Args:
        f: Path of the HTML file to parse.

    Returns:
        A list with one UTF-8 encoded, whitespace-stripped entry per ``<li>``,
        in the order the items are encountered. The list is empty when the
        page contains no ``mw-category-generated`` div.
    """
    # Use a context manager so the handle is closed even if read() fails.
    with open(f, 'r') as fh:
        text = fh.read()

    soup = bs4(text, 'html.parser')
    div = soup.find("div", {"class": 'mw-category-generated'})
    if div is None:
        return []

    ret = []
    for ul in div.find_all('ul'):
        for li in ul.find_all('li'):
            # ``.string`` is None when the <li> does not wrap exactly one
            # string (e.g. link plus trailing text); fall back to the
            # concatenated text instead of failing on ``None.encode``.
            word = li.string
            if word is None:
                word = li.get_text()
            ret.append(word.encode('utf-8').strip())
    return ret
# fout=f+'.txt'
# f=open(fout,'w')
# f.write(out)
# f.close()
# print "{} words to {} done".format(total,fout)
# data = list of words
"""
def insert(data=[],nfile=""):
# sesuaikan
sock = "/opt/lampp/var/mysql/mysql.sock"
db = MySQLdb.connect(host="localhost",
user="root",
passwd="",
db="db1",
unix_socket=sock)
# print db
fmt="(\"{}\")"
ins=[]
total=1
for k in data:
total+=1
ins.append(fmt.format(k))
ins= ",".join(ins)
# return
q="insert into javalist (teks) values {}".format(ins)
cursor = db.cursor()
ex=cursor.execute(q)
db.commit()
print "{} to db".format(nfile)
"""
# nfile="p203.html"
# katas=getlitext(nfile)
# insert(katas,nfile)
from os import listdir
from os.path import isfile, join
# Driver: collect the list-item words from every HTML file in ``mypath`` and
# write them, one per line, to ``all.txt`` in the current directory.
mypath = "."

# Regular files in mypath whose joined path contains the substring 'html'.
htmls = [f for f in listdir(mypath)
         if isfile(join(mypath, f)) and ('html' in join(mypath, f))]

# Flatten the per-file word lists into a single list.
allkata = []
for h in htmls:
    # Open the file relative to mypath (not the bare name) so the lookup
    # keeps working if mypath is ever pointed at another directory.
    allkata.extend(getlitext(join(mypath, h)))

# Report the number of words actually collected; the previous counter
# started at 1 and therefore over-reported by one.
total = len(allkata)

out = "\n".join(allkata)
fout = "all" + '.txt'
with open(fout, 'w') as f:
    f.write(out)

# A single parenthesized argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print("{} words to {} done".format(total, fout))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment