Skip to content

Instantly share code, notes, and snippets.

@emeeks
Created August 8, 2012 21:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emeeks/3299027 to your computer and use it in GitHub Desktop.
Save emeeks/3299027 to your computer and use it in GitHub Desktop.
Create Node and Edge Tables from MALLET Topic-Modeling Output
<?php
header("Content-type: text/html; charset=utf-8");
$link = mysql_connect("localhost","gephi","gephi");
mysql_select_db("dh_stanford");
mysql_set_charset('utf8');
echo ("Start-");
echo date(DATE_RFC822);
echo ("<br>");
ini_set('memory_limit', '1024M');
//Thresholds for word incidence and topic connection levels
$word_threshold = 1;
$topic_threshold = .01;
//Initialize the two files we're working with
$topicDocFile = "topic_to_doc.txt";
$wordTopicFile = "word_to_topic.txt";
//unlink($myFile);
$fh = fopen($topicDocFile, 'r') or die("can't open file");
echo $topicDocFile;
echo ("<br>");
//It might run a while
set_time_limit(0);
//Set up the tables
/*
$query = "
DROP TABLE IF EXISTS `raw`;
CREATE TABLE `raw` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`source` int(11) DEFAULT NULL,
`target` int(11) DEFAULT NULL,
`weight` decimal(5,4) DEFAULT NULL,
`incidence` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=49801 DEFAULT CHARSET=utf8 ROW_FORMAT=FIXED;
DROP TABLE IF EXISTS `raw_nodes`;
CREATE TABLE `raw_nodes` (
`ID` int(11) NOT NULL DEFAULT '0',
`LABEL` varchar(255) DEFAULT NULL,
`TYPE` varchar(255) DEFAULT NULL,
PRIMARY KEY (`ID`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;
";
$write = mysql_query($query);
*/
//READ HEADER and throw it away
//This is the "#doc source topic proportion line and it's worthless
$buffer = fgets($fh);
//COMPUTE LENGTH
$splitcontents = explode($topicDocFile, $buffer);
$book_id = 0;
$book_label = "blank";
$delimiter = "\t";
while (!feof($fh)) {
$buffer = fgets($fh);
$counter = 0;
$splitcontents = explode($delimiter, $buffer);
$x = 0;
while ($x < count($splitcontents)) {
//GET ID
// The first column is your book's numerical ID
if ($x == 0) {
$book_id = $splitcontents[$x];
}
// The second column is the book's label
if ($x == 1) {
// If your book label is legible, uncomment this
//$book_label = $splitcontents[$x];
//else just concatenate "book" and the id value
$book_label = "Book " . $book_id;
$query = '
INSERT
INTO
raw_nodes
(`ID`, `LABEL`, `TYPE`)
VALUES
('.$book_id.', "'.$book_label.'", "BOOK")
';
$write = mysql_query($query);
}
else if ($x > 1) {
$adjusted_topic = $splitcontents[$x] + 4000000;
if ($splitcontents[$x+1] >= $topic_threshold) {
$query = '
INSERT
INTO
raw
(`id`, `source`, `target`, `weight`, `incidence`)
VALUES
(NULL, '.$book_id.', '.$adjusted_topic.', '.$splitcontents[$x+1].', 0)
';
$write = mysql_query($query);
}
//Double increment because we're looking at topic_weight pairs
$x++;
}
$x++;
}
}
echo("Finished-");
echo date(DATE_RFC822);
fclose($fh);
//Create Topic Nodes
$query = '
INSERT INTO
raw_nodes
SELECT DISTINCT
raw.target,
CONCAT("Topic ", (raw.target - 4000000)),
"TOPIC"
FROM
raw
';
$write = mysql_query($query);
//Word to Topic links
$fh = fopen($wordTopicFile, 'r') or die("can't open file");
$word_id = 0;
$word_label = "blank";
$delimiter = " ";
while (!feof($fh)) {
$buffer = fgets($fh);
$counter = 0;
$splitcontents = explode($delimiter, $buffer);
$x = 0;
while ($x < count($splitcontents)) {
//GET ID
/* The first column is MALLET's numerical ID for its words, we're going to modify this so as not to
interfere with our unique IDs for topics or books
*/
if ($x == 0) {
$word_id = $splitcontents[$x] + 2000000;
}
// The second column is the word
if ($x == 1) {
$word_label = $splitcontents[$x];
$query = '
INSERT
INTO
raw_nodes
(`ID`, `LABEL`, `TYPE`)
VALUES
('.$word_id.', "'.$word_label.'", "WORD")
';
$write = mysql_query($query);
}
else if ($x > 1) {
//This time MALLET presents us with ratio pairs, so we need to split those.
$splitpair = explode(":", $splitcontents[$x]);
$adjusted_topic = $splitpair[0] + 4000000;
if( $splitpair[1] >= $word_threshold) {
$query = '
INSERT
INTO
raw
(`id`, `source`, `target`, `weight`, `incidence`)
VALUES
(NULL, '.$word_id.', '.$adjusted_topic.', 1, '.$splitpair[1].')
';
$write = mysql_query($query);
//Double increment because we're looking at topic_weight pairs
}
// $x++;
}
$x++;
}
}
mysql_close($link);
fclose($fh);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment