Created
December 14, 2013 05:22
-
-
Save kurehajime/7955959 to your computer and use it in GitHub Desktop.
はてブのホッテントリのタイトルを要約してWebの今を見つめる ref: http://qiita.com/kurehajime/items/f1cfd74e1ec7b45fbafa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
</head> | |
<body style=""> | |
<p> </p> | |
<p> | |
<meta charset="UTF-8"> | |
<title>要約くん</title> | |
<link rel="stylesheet" href="http://code.jquery.com/mobile/1.1.0/jquery.mobile-1.1.0.min.css" /> | |
<script type="text/javascript" src="http://code.jquery.com/jquery-1.7.1.min.js"></script> | |
<script type="text/javascript" src="http://code.jquery.com/mobile/1.1.0/jquery.mobile-1.1.0.min.js"></script> | |
<script type="text/javascript" src="jscss/tiny_segmenter-0.1.js" charset="UTF-8"> | |
</script> <script type="text/javascript"> | |
// Shared TinySegmenter instance used by makeDic(); it is created once the
// DOM is ready, so it stays undefined until then.
var segmenter;
$(document).ready(function () {
    segmenter = new TinySegmenter();
});
// Run: generate candidate sentences from the input and display the best one.
/**
 * Reads the text in #txtIN, builds the word-transition dictionary from it,
 * generates several random sentences and writes the one whose length is
 * closest to TARGET_LENGTH characters into #txtOUT.
 */
function doAction() {
    var TARGET_LENGTH = 40; // preferred length of the generated sentence
    var EXTRA_TRIES = 11;   // candidates generated in addition to the first one

    var wkIn = $("#txtIN").val(); // input
    var dict = makeDic(wkIn);

    // Every candidate, including the first, is normalized the same way so
    // that the length comparison below is consistent.
    var wkbest = doShuffle(dict).replace(/\n/g, "");
    for (var i = 0; i < EXTRA_TRIES; i++) {
        // Declared with `var` so it does not leak into the global scope.
        var wkOut = doShuffle(dict).replace(/\n/g, "");
        if (Math.abs(TARGET_LENGTH - wkOut.length) < Math.abs(TARGET_LENGTH - wkbest.length)) {
            wkbest = wkOut;
        }
    }
    $("#txtOUT").val(wkbest); // output
}
// Generate one random sentence by walking the transition dictionary.
/**
 * @param {Object} wkDic Map from a word to the array of words observed after
 *     it. "_BOS_" lists the possible first words; "_EOS_" marks the end.
 * @return {string} The generated words joined together and terminated with
 *     "。", or "" when the dictionary has no sentence-start words.
 */
function doShuffle(wkDic) {
    // Uniformly pick one element of a non-empty array.
    function pick(list) {
        return list[Math.floor(Math.random() * list.length)];
    }

    var starts = wkDic["_BOS_"];
    if (!starts || starts.length === 0) {
        // Nothing to start from (e.g. empty input): previously this picked
        // `undefined` and then threw while looking up its followers.
        return "";
    }

    var hasOwn = Object.prototype.hasOwnProperty;
    var wkNowWord = pick(starts);
    var wkStr = "";
    while (wkNowWord != "_EOS_") {
        wkStr += wkNowWord;
        // Only trust own entries so that words such as "constructor" do not
        // resolve to members inherited from Object.prototype.
        var followers = hasOwn.call(wkDic, wkNowWord) ? wkDic[wkNowWord] : null;
        if (!followers || followers.length === 0) {
            break; // dangling word with no recorded successor: end here
        }
        wkNowWord = pick(followers);
    }
    return wkStr + "。";
}
// Build the word-transition dictionary from the input text.
/**
 * Cleans the text with nonoise(), splits it into sentences on "。", segments
 * each sentence with the shared `segmenter`, and records for every word the
 * words that followed it.
 *
 * @param {string} wkStr Raw input text.
 * @return {Object} Map from a word to the array of words seen after it.
 *     "_BOS_" holds the words that may start a sentence (always present,
 *     possibly empty); "_EOS_" is stored as the successor of a sentence's
 *     last word.
 */
function makeDic(wkStr) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var wkLines = nonoise(wkStr).split("。");
    var wkDict = {};
    wkDict["_BOS_"] = [];

    for (var i = 0; i < wkLines.length; i++) {
        var wkWords = segmenter.segment(wkLines[i]);
        if (wkWords[0]) {
            wkDict["_BOS_"].push(wkWords[0]); // sentence head
        }
        for (var w = 0; w < wkWords.length; w++) {
            var wkNowWord = wkWords[w]; // current word
            var isLast = (w === wkWords.length - 1);
            var wkNextWord = isLast ? "_EOS_" : wkWords[w + 1]; // next word

            // Own-property check: a plain truthiness test would mistake
            // inherited members (e.g. "constructor") for existing entries
            // and then fail on .push().
            if (!hasOwn.call(wkDict, wkNowWord)) {
                wkDict[wkNowWord] = [];
            }
            wkDict[wkNowWord].push(wkNextWord);

            // A word following "、" may also start a sentence. A trailing
            // "、" is skipped so that "_EOS_" never becomes a start word
            // (which would generate a bare "。").
            if (wkNowWord == "、" && !isLast) {
                wkDict["_BOS_"].push(wkNextWord);
            }
        }
    }
    return wkDict;
}
// Noise removal: normalize separators before sentence splitting.
/**
 * Turns line breaks, question/exclamation marks and separator punctuation
 * into the sentence delimiter "。", and replaces brackets with a space.
 *
 * NOTE(review): several character classes list the same ASCII character
 * twice (e.g. "?" and "|"); possibly full-width variants were normalized
 * away at some point - confirm against the original file.
 *
 * @param {string} wkStr Text to clean.
 * @return {string} The cleaned text.
 */
function nonoise(wkStr) {
    // [pattern, replacement] pairs, applied in order.
    var rules = [
        [/\n/g, "。"],
        [/[\?\!?!]/g, "。"],
        [/[-||::・]/g, "。"],
        [/[「」()\(\)\[\]【】]/g, " "]
    ];
    var cleaned = wkStr;
    for (var k = 0; k < rules.length; k++) {
        cleaned = cleaned.replace(rules[k][0], rules[k][1]);
    }
    return cleaned;
}
</script> </meta> | |
<div data-role="page" id="first"> | |
<div data-role="content"> | |
<p>今ネット上で話題の記事を一行で要約すると・・・</p> | |
<p><textarea cols="60" rows="8" name="txtIN" id="txtIN" style="max-height:200px;">{{ mes }}</textarea></p> | |
<input type="button" name="" value="生成" onClick=" doAction()"><br>
<textarea cols="60" rows="8" name="txtOUT" id="txtOUT"></textarea>
<p></p> | |
</div> | |
</div> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import webapp2 | |
import os | |
from google.appengine.ext.webapp import template | |
from xml.etree.ElementTree import * | |
import re | |
import urllib | |
class Markov(webapp2.RequestHandler):
    """Request handler that renders the summarizer page with headline text."""

    def get(self):
        """Render ``html/markov.html`` with the fetched titles as ``mes``.

        The ``mode`` query parameter selects the source: ``2ch`` uses
        :meth:`get_2ch`, anything else uses :meth:`get_hotentry_title`.
        """
        if self.request.get('mode') == "2ch":
            mes = self.get_2ch()
        else:
            mes = self.get_hotentry_title()
        template_values = {
            'mes': mes
        }
        path = os.path.join(os.path.dirname(__file__), 'html/markov.html')
        self.response.out.write(template.render(path, template_values))

    def get_hotentry_title(self):
        """Return the hot-entry feed's item titles, one per line.

        A trailing separator (one of ``- : | /``) followed by 1-30 characters
        is stripped from each title. Items without a title text are skipped
        instead of raising.
        """
        response = urllib.urlopen('http://feeds.feedburner.com/hatena/b/hotentry')
        try:
            tree = parse(response)
        finally:
            # Release the connection even when the feed fails to parse.
            response.close()

        titles = []
        for item in tree.findall('./{http://purl.org/rss/1.0/}item'):
            title = item.find('{http://purl.org/rss/1.0/}title')
            if title is None or title.text is None:
                continue
            titles.append(re.sub(r"[-:|/|:].{1,30}$", "", title.text) + "\n")
        return "".join(titles)

    def get_2ch(self):
        """Return the thread titles from ``subject.txt``, one per line.

        The response is decoded as cp932 (undecodable bytes ignored) and
        re-encoded as UTF-8. Each line is split on ``<>`` and the second
        field is used as the title; lines without that field are skipped.
        """
        response = urllib.urlopen('http://engawa.2ch.net/poverty/subject.txt')
        try:
            raw = response.read()
        finally:
            response.close()
        text = raw.decode("cp932", "ignore").encode("utf-8")

        titles = []
        for line in text.split("\n"):
            fields = line.split("<>", 2)
            if len(fields) < 2:
                # Blank or malformed line: nothing usable as a title.
                continue
            # Strip only the trailing "(N)" suffix. The previous pattern
            # "\(.*?\)$" matched from the FIRST "(" in the title onward.
            # Assumes the suffix is a numeric count - TODO confirm.
            titles.append(re.sub(r"\(\d+\)$", "", fields[1]) + "\n")
        return "".join(titles)
# WSGI application: routes /markov.html to the Markov handler (debug enabled).
app = webapp2.WSGIApplication([('/markov.html', Markov)], debug=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment