Created
August 23, 2014 06:44
-
-
Save nicholasdunbar/af73c436fbb0810fd95e to your computer and use it in GitHub Desktop.
Parse columns from pure text into structured data based on the precise spacing of the header and the columns.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Example: parse fixed-width, categorized columns such as the following.
// The data in each row must line up with the heading of its column:
/*
col1         col2    col3
====         ====    ====
1 a b c d e  103 14  as d9
2 a          103 14  as d9
3 a          103 14  as d9
*/
$headings = array('col1','col2','col3');
$header = "col1         col2    col3";
// parse_heading() returns two arrays keyed by column name:
// the start position of every column and the width of every column.
list($heading_pos_list, $lengths) = parse_heading($headings, $header);
// Parse each line into a row structure (associative array keyed by column name).
// Both per-column arrays are handed to parse_line() as a whole.
$line = '1 a b c d e  103 14  as d9';
$row = parse_line($line, $headings, $heading_pos_list, $lengths);
// $row is an array, so a column is read with [] (not called like a function)
echo $row['col1'];
//output:
//1 a b c d e
//continue on for each line.
/************* Functions Below **************/ | |
/**
 * parse_heading
 *
 * Get the start position and width of each column name in $heading by
 * locating it inside the header line $line. Column names that do not
 * occur in $line are removed from $heading (which is passed by reference).
 *
 * Column names are matched literally, so names containing characters that
 * are special in regular expressions (e.g. "Size (GB)", "I/O") are supported.
 *
 * @param array $heading
 *   List of column names to get info about from $line. Modified in place:
 *   names that are not found (and NULL/empty entries) are removed, the keys
 *   of the remaining entries are preserved.
 *   Ex: array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status");
 *
 * @param string $line
 *   The header for the columns
 *   Ex: "Logical Drive Name   LUN   Controller   Accessible by   Logical Drive status"
 *
 * @param boolean $is_logging
 *   When TRUE (default) an alert is echoed for every heading that is not found;
 *   pass FALSE to turn these messages off.
 *
 * @return array $return_val
 *   Multidimensional array containing the start position and width of each column
 *   $return_val[0] = associative array: column name => start position in $line
 *     Ex: $return_val[0]['Controller']
 *         $return_val[0]['Accessible by']
 *   $return_val[1] = associative array: column name => width of the column,
 *     i.e. the length of the name plus the white space that follows it
 *     Ex: $return_val[1]['Controller']
 *   Note: a heading that is not followed by white space (normally the last
 *   one on the line) gets a width equal to the length of its name only.
 *
 */
function parse_heading(&$heading, $line, $is_logging = TRUE) {
    $start_pos = array();
    $lengthOf = array();
    $found = array();
    foreach ($heading as $key => $column_name) {
        // NULL or empty names cannot be located in the header; drop them
        if ($column_name === NULL || strlen($column_name) === 0) {
            continue;
        }
        // first literal occurrence of the column name in the header
        $start = strpos($line, $column_name);
        if ($start === FALSE) {
            if ($is_logging){
                echo "Alert: Heading '$column_name' was not found.\n";
            }
            continue;
        }
        // Width = the name itself plus all white space directly after it.
        // preg_quote() keeps characters like ( ) / . from being read as
        // regex syntax; \G anchors the match at the offset $start.
        $width = strlen($column_name);
        $matches = array();
        $pattern = '/\G' . preg_quote($column_name, '/') . '\s*/';
        if (preg_match($pattern, $line, $matches, 0, $start) === 1) {
            $width = strlen($matches[0]);
        }
        $start_pos[$column_name] = $start;
        $lengthOf[$column_name] = $width;
        $found[$key] = $column_name;
    }
    // only the headings that were actually found remain in the caller's list
    $heading = $found;
    //return start positions and lengths
    return(array($start_pos, $lengthOf));
}
/**
 * parse_line
 *
 * Pull the data out of the line into an array of fields based on the
 * start position and length of each column.
 *
 * @param string $line
 *   The input to be processed
 *
 * @param array $heading
 *   List of column names to extract from $line (the same list that was
 *   passed to parse_heading(...)). Each name becomes a key of the result.
 *
 * @param array $start_pos
 *   Associative array: column name => start position of the column in $line
 *   (first element of the return value of parse_heading(...)).
 *   A column without an entry yields an empty string.
 *
 * @param array $lengthOf
 *   Associative array: column name => width of the column
 *   (second element of the return value of parse_heading(...)).
 *   A missing or zero width means "everything up to the end of the line".
 *
 * @return array
 *   Associative array: column name => trimmed text of that column.
 *   Empty array when $heading is empty.
 */
function parse_line($line, $heading, $start_pos, $lengthOf) {
    // initialise so that an empty $heading still returns an array
    $arr = array();
    foreach($heading as $field) {
        if (! isset($start_pos[$field])) {
            // unknown column: no position information available
            $arr[$field] = "";
        } elseif (empty($lengthOf[$field])) {
            // no (or zero) width: take the rest of the line
            $arr[$field] = trim(substr($line, $start_pos[$field]));
        } else {
            $arr[$field] = trim(substr($line, $start_pos[$field], $lengthOf[$field]));
        }
    }
    return($arr);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment