jlecour/mapping.json

## mapping.json
{
    "mappings": {
        "tweet": {
            "properties": {
                "created_at": {
                    "format": "YYYY-MM-dd HH:mm:ss Z",
                    "type": "date"
                },
                "entities": {
                    "properties": {
                        "hashtags": {
                            "properties": {
                                "indices": {
                                    "type": "long"
                                },
                                "text": {
                                    "type": "string"
                                }
                            }
                        },
                        "urls": {
                            "properties": {
                                "display_url": {
                                    "type": "string"
                                },
                                "expanded_url": {
                                    "index": "not_analyzed",
                                    "type": "string"
                                },
                                "indices": {
                                    "type": "long"
                                },
                                "url": {
                                    "type": "string"
                                }
                            }
                        },
                        "media": {
                            "properties": {
                                "display_url": {
                                    "type": "string"
                                },
                                "expanded_url": {
                                    "index": "not_analyzed",
                                    "type": "string"
                                },
                                "indices": {
                                    "type": "long"
                                },
                                "url": {
                                    "type": "string"
                                },
                                "media_url": {
                                    "type": "string"
                                },
                                "media_url_https": {
                                    "type": "string"
                                },
                                "sizes": {
                                    "properties": {
                                        "h": {
                                            "type": "long"
                                        },
                                        "resize": {
                                            "type": "string"
                                        },
                                        "w": {
                                            "type": "long"
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
                "geo": {
                    "type": "geo_point"
                },
                "id": {
                    "type": "long"
                },
                "id_str": {
                    "type": "string"
                },
                "source": {
                    "index": "not_analyzed",
                    "type": "string"
                },
                "text": {
                    "type": "string"
                },
                "user": {
                    "properties": {
                        "id": {
                            "type": "long"
                        },
                        "id_str": {
                            "type": "string"
                        },
                        "name": {
                            "index": "not_analyzed",
                            "type": "string"
                        },
                        "profile_image_url_https": {
                            "type": "string"
                        },
                        "protected": {
                            "type": "boolean"
                        },
                        "screen_name": {
                            "type": "string"
                        },
                        "verified": {
                            "type": "boolean"
                        }
                    }
                }
            }
        }
    }
}

## tweets_to_es.rb
#!/usr/bin/env ruby

require "pathname"
require "elasticsearch"
require "json"

# Imports every tweet found under ./data/js/tweets/ into the 'twitter'
# Elasticsearch index (document type 'tweet'), one document per tweet,
# using the tweet's "id" as the document id.
client = Elasticsearch::Client.new

data_dir = Pathname.new("./data/js/tweets/")

data_dir.each_child do |file|
  # Only regular files can be read and parsed; skip sub-directories.
  next unless file.file?

  puts file

  # Drop any leading text before the first "[" so that only the JSON array
  # remains. The pattern is anchored with \A, so it can match at most once
  # and a single `sub` is sufficient.
  raw_tweets = file.read.sub(/\A[^\[]+/, '')
  tweets = JSON.parse(raw_tweets)

  tweets.each do |tweet|
    # "geo" may be null/absent, an empty object, or an object without
    # "coordinates"; in all of those cases there is no point to index.
    geo = tweet["geo"]
    coordinates = geo && geo["coordinates"]

    if coordinates.nil? || coordinates.empty?
      tweet.delete("geo")
    else
      # The mapping declares "geo" as a geo_point. Elasticsearch reads the
      # array form as [lon, lat]; the source order is presumably [lat, lon],
      # hence the reversal -- confirm against the data.
      tweet["geo"] = coordinates.reverse
    end

    client.index index: 'twitter', type: 'tweet', id: tweet["id"], body: tweet
  end
end
puts "Done."
	{
	"mappings": {
	"tweet": {
	"properties": {
	"created_at": {
	"format": "YYYY-MM-dd HH:mm:ss Z",
	"type": "date"
	},
	"entities": {
	"properties": {
	"hashtags": {
	"properties": {
	"indices": {
	"type": "long"
	},
	"text": {
	"type": "string"
	}
	}
	},
	"urls": {
	"properties": {
	"display_url": {
	"type": "string"
	},
	"expanded_url": {
	"index": "not_analyzed",
	"type": "string"
	},
	"indices": {
	"type": "long"
	},
	"url": {
	"type": "string"
	}
	}
	},
	"media": {
	"properties": {
	"display_url": {
	"type": "string"
	},
	"expanded_url": {
	"index": "not_analyzed",
	"type": "string"
	},
	"indices": {
	"type": "long"
	},
	"url": {
	"type": "string"
	},
	"media_url": {
	"type": "string"
	},
	"media_url_https": {
	"type": "string"
	},
	"sizes": {
	"properties": {
	"h": {
	"type": "long"
	},
	"resize": {
	"type": "string"
	},
	"w": {
	"type": "long"
	}
	}
	}
	}
	}
	}
	},
	"geo": {
	"type": "geo_point"
	},
	"id": {
	"type": "long"
	},
	"id_str": {
	"type": "string"
	},
	"source": {
	"index": "not_analyzed",
	"type": "string"
	},
	"text": {
	"type": "string"
	},
	"user": {
	"properties": {
	"id": {
	"type": "long"
	},
	"id_str": {
	"type": "string"
	},
	"name": {
	"index": "not_analyzed",
	"type": "string"
	},
	"profile_image_url_https": {
	"type": "string"
	},
	"protected": {
	"type": "boolean"
	},
	"screen_name": {
	"type": "string"
	},
	"verified": {
	"type": "boolean"
	}
	}
	}
	}
	}
	}
	}
	#!/usr/bin/env ruby

	require "pathname"
	require "elasticsearch"
	require "json"

	# Imports every tweet found under ./data/js/tweets/ into the 'twitter'
	# Elasticsearch index (document type 'tweet'), one document per tweet,
	# using the tweet's "id" as the document id.
	client = Elasticsearch::Client.new

	data_dir = Pathname.new("./data/js/tweets/")

	data_dir.each_child do |file|
	  # Only regular files can be read and parsed; skip sub-directories.
	  next unless file.file?

	  puts file

	  # Drop any leading text before the first "[" so that only the JSON array
	  # remains. The pattern is anchored with \A, so it can match at most once
	  # and a single `sub` is sufficient.
	  raw_tweets = file.read.sub(/\A[^\[]+/, '')
	  tweets = JSON.parse(raw_tweets)

	  tweets.each do |tweet|
	    # "geo" may be null/absent, an empty object, or an object without
	    # "coordinates"; in all of those cases there is no point to index.
	    geo = tweet["geo"]
	    coordinates = geo && geo["coordinates"]

	    if coordinates.nil? || coordinates.empty?
	      tweet.delete("geo")
	    else
	      # The mapping declares "geo" as a geo_point. Elasticsearch reads the
	      # array form as [lon, lat]; the source order is presumably [lat, lon],
	      # hence the reversal -- confirm against the data.
	      tweet["geo"] = coordinates.reverse
	    end

	    client.index index: 'twitter', type: 'tweet', id: tweet["id"], body: tweet
	  end
	end
	puts "Done."