Skip to content

Instantly share code, notes, and snippets.

@igorw
Last active October 7, 2015 13:43
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save igorw/ba5e4c9562d19b89ef56 to your computer and use it in GitHub Desktop.
Save igorw/ba5e4c9562d19b89ef56 to your computer and use it in GitHub Desktop.
<?php
// Size, in bytes, of every n-gram used as a Markov state.
$n = 6;
// Transition counts: $rules[$from][$to] is how many times n-gram $to was seen
// starting exactly where n-gram $from ended.
$rules = [];
// Seed states for generation: the first n-gram of every tweet plus every
// full-length n-gram that starts right after a space.
$ngrams = [];
foreach (read_csv('tweets.csv') as $data) {
    $source = $data['text'];
    $source = htmlspecialchars_decode($source);
    // Drop URLs, including ones cut off with a trailing ellipsis. The match
    // ends at the first whitespace so the words after a link are kept; a
    // greedy ".*" here would discard the remainder of the tweet.
    $source = preg_replace('#http\S*#', ' ', $source);
    // Collapse every run of whitespace into a single space.
    $source = preg_replace('/\s+/', ' ', $source);
    $source = trim($source);
    $length = strlen($source);
    // Too short to contain two consecutive n-grams.
    if ($source === '' || $length < 2 * $n) {
        continue;
    }
    // NOTE: strlen()/substr() work on bytes, so a multi-byte character can be
    // split across two adjacent n-grams.
    // Count the transition from the n-gram ending at $i to the one starting at
    // $i. The last start offset is $length - 4, so the final targets can be
    // shorter than $n bytes.
    for ($i = $n; $i <= $length - 4; $i++) {
        $ngram_a = substr($source, $i - $n, $n);
        $ngram_b = substr($source, $i, $n);
        if (!isset($rules[$ngram_a][$ngram_b])) {
            $rules[$ngram_a][$ngram_b] = 0;
        }
        $rules[$ngram_a][$ngram_b]++;
    }
    // The beginning of the tweet is always a valid seed.
    $ngrams[] = substr($source, 0, $n);
    // So is every full-length n-gram that follows a space.
    foreach (str_split($source) as $i => $char) {
        if ($char !== ' ') {
            continue;
        }
        $ngram = substr($source, $i + 1, $n);
        if (strlen($ngram) === $n) {
            $ngrams[] = $ngram;
        }
    }
}
// Prune dead ends: a transition is only useful when its target is itself a
// state with outgoing transitions. Removing targets can empty a state, which
// in turn invalidates transitions pointing at it, so the sweep is repeated a
// fixed three times instead of tracking reachability properly.
for ($pass = 0; $pass < 3; $pass++) {
    foreach (array_keys($rules) as $state) {
        // Keep only the targets that are still keys of $rules.
        $kept = array_intersect_key($rules[$state], $rules);
        if ($kept === []) {
            // Nothing left to transition to: the state itself goes away.
            unset($rules[$state]);
        } else {
            $rules[$state] = $kept;
        }
    }
}
// Generate text forever: start from a random seed n-gram, print it, then
// follow weighted transitions for at most 50 printed states. A state with no
// outgoing rule ends the text early. Each text is followed by a blank line
// and a two second pause.
for (;;) {
    $state = array_pick($ngrams);
    for ($step = 0; $step < 50; $step++) {
        echo $state;
        if (isset($rules[$state])) {
            $state = weighted_pick($rules[$state]);
            continue;
        }
        // no transition found for state
        break;
    }
    echo "\n\n";
    sleep(2);
}
/**
 * Return a randomly chosen element of an array.
 *
 * The key is selected with array_rand(), which does not accept an empty
 * array.
 *
 * @param array $values Array to draw from.
 * @return mixed The value stored under the selected key.
 */
function array_pick($values) {
    $key = array_rand($values);

    return $values[$key];
}
/**
 * Pick a key from a map of key => weight, with probability proportional to
 * the weight.
 *
 * A random number between 1 and the (integer) sum of all weights is drawn
 * and the weights are subtracted from it in iteration order; the key whose
 * weight brings it to zero or below wins.
 *
 * @param array $weighted_values Map of candidate => weight.
 * @return int|string|null The selected key, or null when no key is reached.
 */
function weighted_pick(array $weighted_values) {
    $total = (int) array_sum($weighted_values);
    $remaining = mt_rand(1, $total);

    foreach ($weighted_values as $candidate => $share) {
        $remaining -= $share;
        if ($remaining <= 0) {
            return $candidate;
        }
    }

    return null;
}
/**
 * Lazily read a CSV file and yield each data row as an associative array
 * keyed by the column names from the first (header) row.
 *
 * Blank lines are skipped rather than ending the iteration, and rows whose
 * field count differs from the header are skipped because they cannot be
 * mapped onto the column names. An empty file yields nothing.
 *
 * @param string $filename Path of the CSV file to read.
 * @return Generator Yields one array of column name => field value per row.
 */
function read_csv($filename) {
    $csv = new SplFileObject($filename);
    $cols = $csv->fgetcsv();
    // No usable header row: nothing to yield.
    if (!is_array($cols) || $cols === [null]) {
        return;
    }
    $width = count($cols);
    while (!$csv->eof()) {
        $row = $csv->fgetcsv();
        if ($row === false) {
            // Read failure or end of input; stop instead of spinning.
            break;
        }
        // fgetcsv() reports a blank line as [null].
        if ($row === [null]) {
            continue;
        }
        // array_combine() needs exactly one field per column.
        if (count($row) !== $width) {
            continue;
        }
        yield array_combine($cols, $row);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment