Skip to content

Instantly share code, notes, and snippets.

@ujiro99
Created May 18, 2013 06:50
Show Gist options
  • Save ujiro99/5603517 to your computer and use it in GitHub Desktop.
Save ujiro99/5603517 to your computer and use it in GitHub Desktop.
Node.jsで、取得したWebページをUTF-8へ自動変換する ref: http://qiita.com/items/648a9400e8b25d717b1e
{Iconv} = require "iconv"
{Buffer} = require "buffer"
# Detect the character encoding declared inside an HTML document.
#
# body - the page contents; anything whose toString('binary') yields the
#        markup as text (a Buffer, or a string that is already decoded).
#
# Returns the value captured after `charset=` in the first matching <meta>
# tag, exactly as written in the page, or "utf-8" when no such tag is found.
getCharset = (body) ->
  found = /<meta\b[^>]*charset=["']?([\w\-]+)/i.exec body.toString('binary')
  if found then found[1] else "utf-8"
# Convert a fetched page body to a UTF-8 string.
#
# body - the raw page: either a Buffer, or a string decoded with the 'binary'
#        encoding (one character per original byte).
#
# The source encoding is whatever getCharset(body) reports. Returns the
# converted text as a string.
# NOTE(review): Iconv presumably throws when the detected charset name is not
# one it supports — confirm and decide whether callers need a fallback.
toUtf8 = (body) ->
  # The '//TRANSLIT//IGNORE' suffixes are passed straight through to iconv.
  converter = new Iconv(getCharset(body), 'UTF-8//TRANSLIT//IGNORE')
  # Buffer.from replaces the deprecated `new Buffer(...)` constructor; with
  # the 'binary' encoding each character of the string becomes one byte.
  rawBytes = Buffer.from(body, 'binary')
  converter.convert(rawBytes).toString()
# Public API of this module: only the converter is exported.
module.exports = {toUtf8}
request = require "request"
cheerio = require "cheerio"
charset = require "charset"
# Fetch the page at the given URL and pass the result to scrapingPage.
#
# url - address of the page to download.
processPage = (url) ->
  # The body is requested with the "binary" encoding; scrapingPage converts
  # it to UTF-8 afterwards.
  options =
    uri: url
    encoding: "binary"
  request options, scrapingPage
# Callback for `request`: scrape a page that was fetched successfully.
#
# error    - error passed by request, falsy on success.
# response - response object; only statusCode is inspected here.
# body     - page contents as delivered by request.
scrapingPage = (error, response, body) ->
  # Nothing to do unless the request succeeded with HTTP 200.
  return if error or response.statusCode isnt 200
  # Convert the body to UTF-8 before parsing it.
  html = charset.toUtf8(body)
  $ = cheerio.load(html)
  #
  # Your own scraping goes here ...
  #
# Run: fetch and process the sample page.
processPage 'http://www.jma.go.jp/jma/index.html'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment