Skip to content

Instantly share code, notes, and snippets.

@yuya-takeyama
Created November 7, 2010 09:07
Show Gist options
  • Save yuya-takeyama/666026 to your computer and use it in GitHub Desktop.
Save yuya-takeyama/666026 to your computer and use it in GitHub Desktop.
Script generates N-gram from csv and inserts to MySQL.
<?php
/**
* Script generates N-gram from csv and inserts to MySQL.
* The csv can be downloaded from JapanPost website.
* http://www.post.japanpost.jp/zipcode/download.html
*
* @author Yuya Takeyama
*/
require_once 'Text/Ngram.php';
// Configurations.
define('DB_HOST', 'localhost');
define('DB_USER', 'yourusername');
define('DB_PASS', 'yourpassword');
define('DB_SCHEMA', 'yourschema');
if (empty($argv[1]) || is_readable($argv[1]) === false) {
echo "Usage: {$argv[0]} [Path to address csv]", PHP_EOL;
exit;
} else {
$csv = $argv[1];
}
echo date('Y-m-d H:i:s'), "\tBEGIN", PHP_EOL;
$file = new SplFileObject($csv);
$file->setFlags(SplFileObject::DROP_NEW_LINE | SplFileObject::SKIP_EMPTY | SplFileObject::READ_CSV);
$db = new PDO('mysql:host=' . DB_HOST . ';dbname=' . DB_SCHEMA, DB_USER, DB_PASS);
$db->query('SET NAMES utf8');
$count_ok = 0;
$count_ng = 0;
foreach ($file as $key => $row) {
$row = array_map(function ($val) {
return mb_convert_encoding($val, 'UTF-8', 'SJIS-Win');
}, $row);
$zip = $row[2];
$name = "{$row[6]} {$row[7]} {$row[8]}";
$name_bigram = toBigramString($name);
$sql = 'INSERT INTO addresses (zip, name, name_bigram) VALUES (?, ?, ?)';
$stmt = $db->prepare($sql);
$stmt->bindValue(1, $zip, PDO::PARAM_INT);
$stmt->bindValue(2, $name, PDO::PARAM_STR);
$stmt->bindValue(3, $name_bigram, PDO::PARAM_STR);
$result = $stmt->execute();
if ($result === true) {
$count_ok += 1;
} else {
$count_ng += 1;
}
$count_all = $key + 1;
if ($count_all % 1000 === 0) {
echo date('Y-m-d H:i:s'), "\tProcessed {$count_all} rows.", PHP_EOL;
}
}
echo date('Y-m-d H:i:s'), "\tEND", PHP_EOL;
echo "Inserted successfully {$count_ok} rows.", PHP_EOL;
echo "Failed {$count_ng} rows.", PHP_EOL;
exit;
function toBigramString($str)
{
$ngram = new \Text\Ngram($str, 2, 'UTF-8');
return join(' ', $ngram->toArray());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment