Created
November 7, 2010 09:07
-
-
Save yuya-takeyama/666026 to your computer and use it in GitHub Desktop.
Script generates N-gram from csv and inserts to MySQL.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Script generates N-gram from csv and inserts to MySQL. | |
* The csv can be downloaded from JapanPost website. | |
* http://www.post.japanpost.jp/zipcode/download.html | |
* | |
* @author Yuya Takeyama | |
*/ | |
require_once 'Text/Ngram.php'; | |
// Configurations. | |
define('DB_HOST', 'localhost'); | |
define('DB_USER', 'yourusername'); | |
define('DB_PASS', 'yourpassword'); | |
define('DB_SCHEMA', 'yourschema'); | |
if (empty($argv[1]) || is_readable($argv[1]) === false) { | |
echo "Usage: {$argv[0]} [Path to address csv]", PHP_EOL; | |
exit; | |
} else { | |
$csv = $argv[1]; | |
} | |
echo date('Y-m-d H:i:s'), "\tBEGIN", PHP_EOL; | |
$file = new SplFileObject($csv); | |
$file->setFlags(SplFileObject::DROP_NEW_LINE | SplFileObject::SKIP_EMPTY | SplFileObject::READ_CSV); | |
$db = new PDO('mysql:host=' . DB_HOST . ';dbname=' . DB_SCHEMA, DB_USER, DB_PASS); | |
$db->query('SET NAMES utf8'); | |
$count_ok = 0; | |
$count_ng = 0; | |
foreach ($file as $key => $row) { | |
$row = array_map(function ($val) { | |
return mb_convert_encoding($val, 'UTF-8', 'SJIS-Win'); | |
}, $row); | |
$zip = $row[2]; | |
$name = "{$row[6]} {$row[7]} {$row[8]}"; | |
$name_bigram = toBigramString($name); | |
$sql = 'INSERT INTO addresses (zip, name, name_bigram) VALUES (?, ?, ?)'; | |
$stmt = $db->prepare($sql); | |
$stmt->bindValue(1, $zip, PDO::PARAM_INT); | |
$stmt->bindValue(2, $name, PDO::PARAM_STR); | |
$stmt->bindValue(3, $name_bigram, PDO::PARAM_STR); | |
$result = $stmt->execute(); | |
if ($result === true) { | |
$count_ok += 1; | |
} else { | |
$count_ng += 1; | |
} | |
$count_all = $key + 1; | |
if ($count_all % 1000 === 0) { | |
echo date('Y-m-d H:i:s'), "\tProcessed {$count_all} rows.", PHP_EOL; | |
} | |
} | |
echo date('Y-m-d H:i:s'), "\tEND", PHP_EOL; | |
echo "Inserted successfully {$count_ok} rows.", PHP_EOL; | |
echo "Failed {$count_ng} rows.", PHP_EOL; | |
exit; | |
function toBigramString($str) | |
{ | |
$ngram = new \Text\Ngram($str, 2, 'UTF-8'); | |
return join(' ', $ngram->toArray()); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment