Skip to content

Instantly share code, notes, and snippets.

View justindavies's full-sized avatar

Justin Davies justindavies

View GitHub Profile
import sys
import json
fo = open(sys.argv[1], "r")
lines = fo.readlines()
for line in lines:
@justindavies
justindavies / extract_ners.py
Last active May 19, 2020 00:42
Extract NER
from elasticsearch import Elasticsearch
import spacy
import os
import json
from pymongo import MongoClient
from spacy.pipeline import EntityRuler
import hashlib
import inflection
def set_custom_boundaries(doc):
[
{
"data": [
{
"x": 2013,
"y": 1
},
{
"x": 2014,
"y": 3
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: api-ingress
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "false"
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/use-regex: "true"
# Base image: nginx.
FROM nginx
# Create the directory that will hold the sitemap XML files.
RUN mkdir /usr/share/nginx/html/sitemaps
# Copy every .xml file from the build context into that directory.
COPY *.xml /usr/share/nginx/html/sitemaps/
import urllib, json
import glob
import os
import pymongo
from pymongo import MongoClient
# Set the latest Block from the API
client = MongoClient(os.environ["DB"])
apiVersion: v1
kind: Service
metadata:
name: frontend
labels:
name: frontend
spec:
type: LoadBalancer
ports:
- name: http
apiVersion: batch/v1
kind: Job
metadata:
name: sitemap
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: sitemap
# Build a sitemap index document that lists every *.xml file in the current
# working directory as a child sitemap under http://inkl.in/sitemaps/.
# The finished document is left in the module-level name `sitemapindex`.
from xml.sax.saxutils import escape

# Collect the pieces in a list and join once at the end instead of growing a
# string by repeated concatenation.
_sitemapindex_parts = [
    '<?xml version="1.0" encoding="UTF-8"?>\n',
    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
]
for file in glob.glob('*.xml'):
    _sitemapindex_parts.append('<sitemap>\n')
    # Entity-escape the file name (&, <, >) so an unusual name cannot
    # produce a malformed XML document.
    _sitemapindex_parts.append(
        ' <loc>http://inkl.in/sitemaps/' + escape(file) + '</loc>\n'
    )
    _sitemapindex_parts.append('</sitemap>\n')
_sitemapindex_parts.append('</sitemapindex>')
sitemapindex = ''.join(_sitemapindex_parts)
for x in range(int(data["block_number"]), 1, -1):
line = line + "<url>\n"
line = line + "<loc>http://inkl.in/"+str(x)+ "</loc>\n"
line = line + "<changefreq>never</changefreq>"
line = line + "</url>\n"
counter=counter+1
if counter == 50000:
counter = 1
print("Writing " + str(filename))