Skip to content

Instantly share code, notes, and snippets.

@nicholasdunbar
Last active August 29, 2015 14:05
Show Gist options
  • Save nicholasdunbar/9c5388d243bd4dff48f2 to your computer and use it in GitHub Desktop.
Save nicholasdunbar/9c5388d243bd4dff48f2 to your computer and use it in GitHub Desktop.
For parsing columns from plain text with raged or unevenly spaced edges.
<?php
//To parse the following unevenly categorized columns:
/*
* col1 col2 col3
* ==== ==== ====
* 1 a b c d e 103 14 as d9
* 2 a 103 14 as d9
* 3 a 103 14 as d9
*
* Into structured data like this:
*
* col1 col2 col3
* ==== ==== ====
* 1 a b c d e 103 14 as d9
* 2 a 103 14 as d9
* 3 a 103 14 as d9
*/
$headings = array('col1','col2','col3');
$header = "col1 col2 col3";
//get the $heading_pos_list by parsing the headings of each column with
list($heading_pos_list, $lengths) = parse_heading($headings, $header);
//Parse each line into a row structure
$line = '1 a b c d e 103 14 as d9';
$row = parse_line_approx($line, $heading_pos_list);
//this works for each column and line
echo $row('col1');
//output:
//1 a b c d e
/************* Functions Below **************/
/**
* parse_heading
*
* Get the position and length of each column name in $heading
* if a column name is not in $line then remove it from $heading
*
* @param array $heading
* List of column names to get info about from $line
* Ex: array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status");
*
* @param string $line
* The header for the columns
* Ex: "Logical Drive Name LUN Controller Accessible by Logical Drive status"
*
* @param boolean $is_logging
* Turn off log messages when a header is not found
*
* @return array $return_val
* Multidimentional array containing the start position of each column and length
* $return_val[0] = multi-element associative array
* Ex: $return_val[0]['Controller']
* $return_val[0]['Accessible by']
* $return_val[0] contains the starting position for each header
* $heading_key is the column name that has been like 'CONTROLLER' above
* Ex: $return_val[0][$heading_key] = starting position of heading
* $return_val[1] contains the length of each header
* Ex: $return_val[1][$heading_key] = length of heading including white space after it
*
*/
function parse_heading(&$heading, $line, $is_logging = TRUE) {
//search $line to find each start position of $heading
$start_pos=array();
$lengthOf=array();
$num_headings = count($heading);
for ($i = 0; $i < $num_headings; $i++){
$column_name = $heading[$i];
$matches = array();
$start = strpos($line, $column_name);
if (preg_match("/^.*?$column_name\s*/", $line, $matches)){
$end = strlen($matches[0]);
if($start !== FALSE) {
$start_pos[$column_name] = $start;
$lengthOf[$column_name] = $end-$start;
}
} else {
$heading[$i] = NULL;
if ($is_logging){
echo "Alert: Heading '$column_name' was not found.\n";
}
}
}
//filter the elements of the array using the function strlen,
//if strlen returns 0 then the element is removed
$heading = array_filter($heading, 'strlen');
//return start positions and lengths
return(array($start_pos, $lengthOf));
}
/**
* parse_line_approx
*
* For parsing columns with raged edges. Pull the data out of $line and
* put it into an associative array of fields and keys found in $heading
* Ex:
* Array(0=>Array('key'=>0),1=>Array('key'=>3)).
*
* If the positions are close, but off by a few characters, then adjust
* the start and end to the real edge of the column. This function is an
* improvement on parse_line because it deals with ragged edges of columns.
*
* Example Usage:
* Say we have the following data:
* col1 col2 col3
* ==== ==== ====
* 1 a b c d e 103 14 as d9
* 2 a 103 14 as d9
* 3 a 103 14 as d9
*
* If we want to transform the data into this
* col1 col2 col3
* ==== ==== ====
* 1 a b c d e 103 14 as d9
* 2 a 103 14 as d9
* 3 a 103 14 as d9
*
* We would do the following:
* 1.) get the $heading_pos_list by parsing the headings of each column with
* list($heading_pos_list,$lengths) = parse_heading(array('col1','col2','col3'),"col1 col2 col3")
* 2.) Parse each line into a row structure
* $row = parse_line_approx($line, $heading_pos_list)
* 3.) echo "$row('col1')" etc. etc. for each column and line
* 4.) format it into columns
*
* @param string $line
* The input to be processed
*
* @param array $heading_pos_list
* The return value you get from list($heading_pos_list,$lengths)=parse_heading(...)
* used to determine where columns start ($return_value['some_key']) in a line
*
* @return array
* An array of keys and coresponding values like
* array('some_key1'=>'column_value1','some_key2'=>'column_value2','etc'=>'etc')
*/
function parse_line_approx($line, $heading_pos_list) {
//init vars
//return variable
$row = array();
//list of whitespaces and values in $line
$chunks_array = array();
//list of positions of chunks in string $line
$chunks_pos_array = array();
//list of distances of each chunk from every heading
$chunks_dist_array = array();
//chunks as they are matched via preg_match_all
$matches = array();
//get chunks
preg_match_all("/\s+|[^\s]+/",$line,$matches,PREG_OFFSET_CAPTURE);
while ( $el = array_shift($matches[0]) ){
$chunks_array[] = $el[0];
$chunks_pos_array[] = $el[1];
}
//calculate distances between chunks and each column heading
$counter = 0;
$chunks_dist_array = array();
foreach ($heading_pos_list as $heading=>$start_val){
//begining of column
$start = $start_val;
foreach ($chunks_pos_array as $index=>$chunk_pos){
$distance = abs($start-$chunk_pos);
$chunks_dist_array[$counter][] = array( "heading"=>$heading,
"distance"=>$distance,
"chunk_index"=>$index,
"heading_pos"=>$start,
"chunk_pos"=>$chunk_pos,
"chunk_val"=>$chunks_array[$index] );
}
$counter++;
}
//categorize into each heading by whichever is closest to the heading
$processed_indexes = array();
foreach ($chunks_dist_array as $index=>$info){
if (!isset($chunks_dist_array[$index+1])){
//add remaining to the last column
for ($i=0;$i<$len;$i++){
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
if (!isset($row[$info[$i]['heading']])){
$row[$info[$i]['heading']] = "";
}
$row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
$processed_indexes[] = $info[$i]['chunk_index'];
}
}
break;
}
$next_info = $chunks_dist_array[$index+1];
$len = count($info);
$next_heading_pos = $next_info[0]['heading_pos'];
$last_proc_index = 0;
for ($i=0;$i<$len;$i++){
//categorize into each heading by whichever is closest to the heading
/*
col1 col2 col3 col1 col2 col3
================================ ================================
a b c d 103 14 as d9 => a b c 103 14 as d9
*/
if ($info[$i]['distance'] < $next_info[$i]['distance']){
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
if (!isset($row[$info[$i]['heading']])){
$row[$info[$i]['heading']] = "";
}
$row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
$processed_indexes[] = $info[$i]['chunk_index'];
$last_proc_index = $i;
}
} else if ($next_info[$i]['chunk_pos'] < $next_heading_pos ) {
//columns are not always seperated by which chunk is closer to a column
//take into account cases (like col1) with wide columns full of spaces:
/*
col1 col2 col3 col1 col2 col3
================================ ================================
a b c d e 103 14 as d9 => a b c d e 103 14 as d9
*/
//where the start of a col2 is demarcated by a larger space:
if ( preg_match("/^\s{2,}|\t$/",$info[$i]['chunk_val']) ){
//if there is a big space then break into previous heading
for ($j=$last_proc_index+1;$j<=$i;$j++){
$row[$info[$j]['heading']] .= $info[$j]['chunk_val'];
$processed_indexes[] = $info[$j]['chunk_index'];
}
//found a logical break, process next column
break 1;
}
} else {
//too far away to categorize in this heading. Move to next heading
break 1;
}
}
}
//clean leading and trailing whitespace
foreach ($row as $heading=>$col_val){
$row[$heading] = trim($col_val);
}
return $row;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment