Skip to content

Instantly share code, notes, and snippets.

@404mike
Last active July 31, 2018 10:50
Show Gist options
  • Save 404mike/83b0a5f362a8e72bddf61c2e8763a8e0 to your computer and use it in GitHub Desktop.
Save 404mike/83b0a5f362a8e72bddf61c2e8763a8e0 to your computer and use it in GitHub Desktop.
ingest.php
<?php
/**
 * Transforms a JSON record into a Solr XML update document,
 * cleans the text values (strips markup, escapes XML characters),
 * and ingests the result into Solr.
 */
class Ingest {

    /**
     * Solr update endpoint used when none is passed to the constructor.
     */
    const DEFAULT_SOLR_URL = 'http://localhost:8984/solr/stats/update?commit=true';

    /**
     * Solr update endpoint that documents are posted to.
     * @var string
     */
    private $solrUrl;

    /**
     * @param string $solrUrl - Solr update URL; defaults to the local "stats" core
     */
    public function __construct($solrUrl = self::DEFAULT_SOLR_URL)
    {
        $this->solrUrl = $solrUrl;
    }

    /**
     * Read a JSON file, decode it and pass the record on for ingest.
     * Unreadable files and content that does not decode to an array are skipped.
     * @param string $file - path to the json file
     */
    public function getData($file)
    {
        $json = file_get_contents($file);
        if ($json === false) {
            // file_get_contents() has already raised a warning; nothing to ingest.
            return;
        }

        $data = json_decode($json, true);
        if (!is_array($data)) {
            return;
        }

        $this->formatData($data);
    }

    /**
     * Convert one decoded record to Solr XML and post it to Solr.
     * Records without an "id" are ignored.
     * @param array $data
     */
    public function formatData($data)
    {
        $xml = $this->buildXml($data);
        if ($xml === null) {
            return;
        }

        $this->sendToSolr($xml);
    }

    /**
     * Build the Solr <add><doc>...</doc></add> document for one record.
     * Performs no I/O, so it can be inspected without a running Solr.
     * @param array $data - decoded JSON record
     * @return string|null XML string, or null when the record has no "id"
     */
    public function buildXml($data)
    {
        if (!is_array($data) || !isset($data['id'])) {
            return null;
        }

        $root = new SimpleXMLElement('<add/>');
        $doc = $root->addChild('doc');
        $nodeType = isset($data['nodeType']) ? $data['nodeType'] : null;

        // id
        $this->addField($doc, 'id', $data['id']);

        // title_en / title_cy
        $this->addBilingualFields($doc, 'title', isset($data['title']) ? $data['title'] : null);

        // nodeType
        if ($nodeType !== null) {
            $this->addField($doc, 'nodeType', $nodeType);
        }

        // created, day_created, hour_created
        if (isset($data['created']) && is_numeric($data['created'])) {
            $createdTs = (int) $data['created'];
            // The value carries a literal "Z" (UTC) suffix, so it must be
            // formatted with gmdate(); date() would use the default timezone.
            $this->addField($doc, 'created', gmdate('Y-m-d\TH:i:s\Z', $createdTs));
            // Day name and hour keep using date(), i.e. PHP's default timezone.
            $this->addField($doc, 'day_created', date('l', $createdTs));
            $this->addField($doc, 'hour_created', date('G', $createdTs));
        }

        // updated
        if (isset($data['updated']) && is_numeric($data['updated'])) {
            $this->addField($doc, 'updated', gmdate('Y-m-d\TH:i:s\Z', (int) $data['updated']));
        }

        // description_en / description_cy
        $this->addBilingualFields($doc, 'description', isset($data['description']) ? $data['description'] : null);

        // author_id
        if (isset($data['author']['id'])) {
            $this->addField($doc, 'author_id', $data['author']['id']);
        }

        // author_name
        if (isset($data['author']['name'])) {
            $this->addField($doc, 'author_name', $data['author']['name'], true);
        }

        // creator
        if (isset($data['creator'])) {
            $this->addField($doc, 'creator', $data['creator']);
        }

        // copyright_* (only the first copyright entry is indexed)
        $copyright = isset($data['copyright'][0]) ? $data['copyright'][0] : null;
        $this->addBilingualFields($doc, 'copyright_holder', isset($copyright['holder']) ? $copyright['holder'] : null);
        if (isset($copyright['year'])) {
            $this->addField($doc, 'copyright_year', $copyright['year']);
        }
        if (isset($copyright['type']) && is_scalar($copyright['type'])) {
            $this->addField($doc, 'copyright_type', strtolower(trim($copyright['type'])));
        }

        // licence_type (the source key is spelled "license")
        if (isset($data['license']['type'])) {
            $this->addField($doc, 'licence_type', $data['license']['type']);
        }

        // tags: plain strings on "item" records, arrays with a "name" key otherwise.
        // The shape of each entry is inspected directly, so either form is accepted.
        if (isset($data['tags']) && is_array($data['tags'])) {
            foreach ($data['tags'] as $tag) {
                if (is_array($tag)) {
                    if (!isset($tag['name'])) {
                        continue;
                    }
                    $tag = $tag['name'];
                }
                $this->addField($doc, 'tags', $tag, true);
            }
        }

        // when / what / learn facets
        foreach (array('when', 'what', 'learn') as $facet) {
            if (isset($data[$facet]) && is_array($data[$facet])) {
                foreach ($data[$facet] as $facetValue) {
                    $this->addField($doc, $facet, $facetValue, true);
                }
            }
        }

        // location (only the first location entry is indexed)
        if (isset($data['locations'][0]['lat'], $data['locations'][0]['lon'])) {
            $lat = $data['locations'][0]['lat'];
            $lon = $data['locations'][0]['lon'];
            if (is_scalar($lat) && is_scalar($lon)) {
                // "lat,lon" form
                $this->addField($doc, 'location', $lat . ',' . $lon);
                $this->addField($doc, 'location_fixed', $lat . ',' . $lon);
                // "lon lat" form
                $this->addField($doc, 'location_rpt_geo', $lon . ' ' . $lat);
                $this->addField($doc, 'location_rpt_two_geo', $lon . ' ' . $lat);
                $this->addField($doc, 'geo_rpt_geo', $lon . ' ' . $lat);
            }
        }

        // content_en / content_cy
        $this->addBilingualFields($doc, 'content', isset($data['content']) ? $data['content'] : null);

        // num_items: items count their files, the other node types their collection items
        $countKey = null;
        if ($nodeType === 'item') {
            $countKey = 'files';
        } elseif (in_array($nodeType, array('trail', 'story', 'collection'), true)) {
            $countKey = 'collection_items';
        }
        if ($countKey !== null) {
            $hasList = isset($data[$countKey]) && is_array($data[$countKey]);
            $this->addField($doc, 'num_items', $hasList ? count($data[$countKey]) : 0);
        }

        return $root->asXML();
    }

    /**
     * Append one <field name="..."> element to the document.
     * Non-scalar values are skipped.
     * @param SimpleXMLElement $doc - the <doc> element
     * @param string $name - Solr field name
     * @param mixed $value - raw field value
     * @param bool $stripTags - strip HTML/PHP tags before escaping
     */
    private function addField($doc, $name, $value, $stripTags = false)
    {
        if (!is_scalar($value)) {
            return;
        }

        // addChild() treats its value as XML content with entity references,
        // so every value has to be escaped before it is handed over.
        $text = $stripTags ? $this->cleanData($value) : $this->escape($value);
        $doc->addChild('field', $text)->addAttribute('name', $name);
    }

    /**
     * Append the "<prefix>_en" and "<prefix>_cy" fields for whichever
     * language variants are present.
     * @param SimpleXMLElement $doc - the <doc> element
     * @param string $prefix - field name prefix, e.g. "title"
     * @param mixed $values - array keyed by "en" / "cy", or null
     */
    private function addBilingualFields($doc, $prefix, $values)
    {
        foreach (array('en', 'cy') as $lang) {
            if (isset($values[$lang])) {
                $this->addField($doc, $prefix . '_' . $lang, $values[$lang], true);
            }
        }
    }

    /**
     * Solr ingest: POST the XML document to the configured update URL.
     * Prints the curl error on transport failure. When the response reports
     * status 400 the rejected document is printed and the script stops.
     * @param string $file - XML document to post
     */
    private function sendToSolr($file)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->solrUrl);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-type:text/xml; charset=utf-8"));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $file);
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($ch, CURLINFO_HEADER_OUT, 1);

        $response = curl_exec($ch);
        // Capture the error state first so the handle can be closed on every path.
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if ($errno) {
            print "curl_error:" . $error;
            return;
        }

        if (!is_string($response) || $response === '') {
            return;
        }

        $parser = xml_parser_create();
        xml_parse_into_struct($parser, $response, $vals, $index);
        xml_parser_free($parser);

        // Presumably the third parsed element is the status entry of Solr's
        // response header - TODO confirm against the Solr version in use.
        if (isset($vals[2]['value']) && $vals[2]['value'] == '400') {
            // NOTE(review): this halts the whole script on the first rejected document.
            print_r($file);
            die();
        }
    }

    /**
     * Format a string for ingest: strip tags, then escape XML special characters.
     * @param string $string
     * @return string the cleaned, escaped string
     */
    private function cleanData($string)
    {
        return $this->escape(strip_tags((string) $string));
    }

    /**
     * Escape XML special characters in a value.
     * Invalid UTF-8 sequences are substituted instead of emptying the string.
     * @param mixed $value - scalar value
     * @return string
     */
    private function escape($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment