Last active
July 31, 2018 10:50
-
-
Save 404mike/83b0a5f362a8e72bddf61c2e8763a8e0 to your computer and use it in GitHub Desktop.
ingest.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Transform the JSON data to Solr XML,
 * clean the data and
 * ingest it into Solr.
 */
/**
 * Converts one decoded JSON record into a Solr <add><doc> XML payload and
 * posts it to the Solr update handler.
 */
class Ingest
{
    /**
     * Read a JSON file, decode it and pass the record to formatData().
     *
     * A file that cannot be read, or whose content does not decode to an
     * array with an "id" key, results in nothing being sent.
     *
     * @param string $file - path to the json file
     */
    public function getData($file)
    {
        $json = file_get_contents($file);
        if ($json === false) {
            // file_get_contents() has already raised a warning; nothing to ingest.
            return;
        }

        $this->formatData(json_decode($json, true));
    }

    /**
     * Build the Solr document for one record and send it to Solr.
     *
     * Records that are not arrays or have no "id" are ignored.
     *
     * @param array $data decoded JSON record
     */
    public function formatData($data)
    {
        if (!is_array($data) || !isset($data['id'])) {
            return;
        }

        $this->sendToSolr($this->buildXml($data));
    }

    /**
     * Build the <add><doc>...</doc></add> XML string for one record.
     *
     * Optional keys that are absent from $data are skipped instead of being
     * emitted as empty fields.
     *
     * @param array $data decoded JSON record containing at least "id"
     * @return string|false XML as returned by SimpleXMLElement::asXML()
     */
    private function buildXml($data)
    {
        $root = new SimpleXMLElement('<add/>');
        $doc = $root->addChild('doc');
        $nodeType = isset($data['nodeType']) ? $data['nodeType'] : null;

        // id
        $this->addField($doc, 'id', $data['id']);

        // title_en / title_cy
        $this->addTranslations($doc, 'title', isset($data['title']) ? $data['title'] : null);

        // nodeType
        if ($nodeType !== null) {
            $this->addField($doc, 'nodeType', $nodeType);
        }

        if (isset($data['created'])) {
            // created: the trailing "Z" means UTC, so format with gmdate()
            // rather than date(), which uses the default timezone.
            $this->addField($doc, 'created', gmdate('Y-m-d\TH:i:s\Z', $data['created']));
            // day_created / hour_created stay in PHP's default timezone, as before.
            $this->addField($doc, 'day_created', date('l', $data['created']));
            $this->addField($doc, 'hour_created', date('G', $data['created']));
        }

        // updated (UTC, same reasoning as "created")
        if (isset($data['updated'])) {
            $this->addField($doc, 'updated', gmdate('Y-m-d\TH:i:s\Z', $data['updated']));
        }

        // description_en / description_cy
        $this->addTranslations($doc, 'description', isset($data['description']) ? $data['description'] : null);

        // author_id / author_name
        if (isset($data['author']['id'])) {
            $this->addField($doc, 'author_id', $data['author']['id']);
        }
        if (isset($data['author']['name'])) {
            $this->addTextField($doc, 'author_name', $data['author']['name']);
        }

        // creator
        if (isset($data['creator'])) {
            $this->addField($doc, 'creator', $data['creator']);
        }

        // copyright_* (only the first copyright entry is indexed)
        if (isset($data['copyright'][0]['holder'])) {
            $this->addTranslations($doc, 'copyright_holder', $data['copyright'][0]['holder']);
        }
        if (isset($data['copyright'][0]['year'])) {
            $this->addField($doc, 'copyright_year', $data['copyright'][0]['year']);
        }
        if (isset($data['copyright'][0]['type'])) {
            $this->addField($doc, 'copyright_type', strtolower(trim($data['copyright'][0]['type'])));
        }

        // licence_type (the source key is spelled "license")
        if (isset($data['license']['type'])) {
            $this->addField($doc, 'licence_type', $data['license']['type']);
        }

        // tags: plain strings on items, arrays carrying a "name" key otherwise
        if (isset($data['tags']) && is_array($data['tags'])) {
            foreach ($data['tags'] as $tag) {
                if ($nodeType === 'item') {
                    $label = $tag;
                } else {
                    $label = isset($tag['name']) ? $tag['name'] : null;
                }
                if ($label !== null) {
                    $this->addTextField($doc, 'tags', $label);
                }
            }
        }

        // when / what / learn facets (multi-valued)
        foreach (array('when', 'what', 'learn') as $facet) {
            if (isset($data[$facet]) && is_array($data[$facet])) {
                foreach ($data[$facet] as $value) {
                    $this->addTextField($doc, $facet, $value);
                }
            }
        }

        // location: only the first location is indexed, and only when both
        // coordinates are present.
        if (isset($data['locations'][0]['lat'], $data['locations'][0]['lon'])) {
            $lat = $data['locations'][0]['lat'];
            $lon = $data['locations'][0]['lon'];
            $latLon = $lat . ',' . $lon; // "lat,lon" form
            $lonLat = $lon . ' ' . $lat; // "lon lat" form

            $this->addField($doc, 'location', $latLon);
            $this->addField($doc, 'location_fixed', $latLon);
            $this->addField($doc, 'location_rpt_geo', $lonLat);
            $this->addField($doc, 'location_rpt_two_geo', $lonLat);
            $this->addField($doc, 'geo_rpt_geo', $lonLat);
        }

        // content_en / content_cy
        $this->addTranslations($doc, 'content', isset($data['content']) ? $data['content'] : null);

        // num_items: items count their files, the container types count
        // their collection items; any other node type gets no num_items field.
        $countKey = null;
        if ($nodeType === 'item') {
            $countKey = 'files';
        } elseif (in_array($nodeType, array('trail', 'story', 'collection'), true)) {
            $countKey = 'collection_items';
        }
        if ($countKey !== null) {
            $items = (isset($data[$countKey]) && is_array($data[$countKey])) ? $data[$countKey] : array();
            $this->addField($doc, 'num_items', count($items));
        }

        return $root->asXML();
    }

    /**
     * Append <field name="$name">$value</field> with the value XML-escaped only.
     *
     * @param SimpleXMLElement $doc   the <doc> element
     * @param string           $name  Solr field name
     * @param mixed            $value scalar field value
     */
    private function addField(SimpleXMLElement $doc, $name, $value)
    {
        $field = $doc->addChild('field', $this->escapeXml($value));
        $field->addAttribute('name', $name);
    }

    /**
     * Append a field whose value is free text: tags are stripped, then escaped.
     *
     * @param SimpleXMLElement $doc   the <doc> element
     * @param string           $name  Solr field name
     * @param string           $value text that may contain markup
     */
    private function addTextField(SimpleXMLElement $doc, $name, $value)
    {
        $field = $doc->addChild('field', $this->cleanData($value));
        $field->addAttribute('name', $name);
    }

    /**
     * Append "{$prefix}_en" and "{$prefix}_cy" text fields for the languages
     * present in $translations.
     *
     * @param SimpleXMLElement $doc          the <doc> element
     * @param string           $prefix       field name without language suffix
     * @param mixed            $translations array keyed by "en"/"cy"; anything else is ignored
     */
    private function addTranslations(SimpleXMLElement $doc, $prefix, $translations)
    {
        if (!is_array($translations)) {
            return;
        }

        foreach (array('en', 'cy') as $lang) {
            if (isset($translations[$lang])) {
                $this->addTextField($doc, $prefix . '_' . $lang, $translations[$lang]);
            }
        }
    }

    /**
     * Solr ingest: POST the XML payload to the update handler.
     *
     * On a transport error the cURL error is printed. When Solr rejects the
     * document with status 400, the payload is printed and the script stops.
     *
     * @param string $file XML payload to post (not a file path)
     */
    private function sendToSolr($file)
    {
        $url = "http://localhost:8984/solr/stats/update?commit=true";

        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $url,
            CURLOPT_HTTPHEADER     => array("Content-type:text/xml; charset=utf-8"),
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_POST           => 1,
            CURLOPT_POSTFIELDS     => $file,
            CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_1_1,
            CURLINFO_HEADER_OUT    => 1,
        ));

        $response = curl_exec($ch);
        // Read everything needed from the handle first, so it can be closed
        // on the error path as well as on success.
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        $httpStatus = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($errno) {
            print "curl_error:" . $error;
            return;
        }

        // Check the HTTP status as well as the XML body so a rejection is
        // still detected when the body is not the expected XML.
        if ($httpStatus === 400 || $this->responseStatus($response) === '400') {
            print_r($file);
            die();
        }
    }

    /**
     * Extract responseHeader/status from a Solr XML response body.
     *
     * @param mixed $body raw response body
     * @return string|null status text, or null when the body is not XML or
     *                     carries no status element
     */
    private function responseStatus($body)
    {
        if (!is_string($body) || $body === '') {
            return null;
        }

        // Collect parse errors internally so a non-XML body raises no warnings.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($xml === false) {
            return null;
        }

        $status = $xml->xpath('/response/lst[@name="responseHeader"]/int[@name="status"]');

        return empty($status) ? null : (string) $status[0];
    }

    /**
     * Format a string for ingest: strip markup, then XML-escape it.
     *
     * @param string $string
     * @return string cleaned, escaped string
     */
    private function cleanData($string)
    {
        return $this->escapeXml(strip_tags($string));
    }

    /**
     * Escape a value for use as SimpleXMLElement::addChild() content.
     *
     * addChild() treats "&" as the start of an entity reference, so raw
     * values must be escaped first. Flags are explicit so the result does not
     * depend on the PHP version's defaults; invalid UTF-8 is substituted
     * rather than turning the whole string into "".
     *
     * @param mixed $value scalar value
     * @return string
     */
    private function escapeXml($value)
    {
        return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8');
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment