Skip to content

Instantly share code, notes, and snippets.

@srinathperera
Created October 15, 2020 09:13
Show Gist options
  • Save srinathperera/fc35d458c8be7c1a6f781c2b9b5b72b3 to your computer and use it in GitHub Desktop.
Save srinathperera/fc35d458c8be7c1a6f781c2b9b5b72b3 to your computer and use it in GitHub Desktop.
import json
import os
import re

import pandas as pd
# Compiled once at import time so the per-row cleaning does not re-resolve them.
_WHITESPACE_RE = re.compile(r"\s+")
_QUESTION_MARKS_RE = re.compile(r"\?+")


def _clean_summary(summary):
    """Normalize a summary string before it is written to the text exports.

    Applies, in order: collapse every whitespace run to a single space,
    replace every run of ``?`` with a single space, and drop every literal
    ``...`` sequence.
    """
    summary = _WHITESPACE_RE.sub(" ", summary)
    # Runs of "?" are stripped because "???" is used below as the separator
    # between summary and page in ballerina_content.txt.
    summary = _QUESTION_MARKS_RE.sub(" ", summary)
    return summary.replace("...", "")


def load_ballerina_docs(file_name: str) -> None:
    """Export the entries of a JSON content index to CSV and text files.

    The JSON document at *file_name* must have a ``"contentIndex"`` key
    holding a list of objects, each with ``"page"``, ``"name"``,
    ``"summary"`` and ``"content"`` keys.

    Side effects (all paths are relative to the current working directory):

    * prints the number of entries and, when there is at least one, the
      keys of the first entry;
    * writes every entry to ``ballerina_content.csv``;
    * writes ``ballerina_content.txt`` with one record per entry whose page
      contains ``"/learn"``:
      ``<cleaned summary>???<page><|endoftext|>`` followed by a newline;
    * writes the cleaned summary of each such entry to
      ``solr_docs/<page with "/" replaced by "_">.txt``, creating the
      ``solr_docs`` directory if it does not exist yet.

    Args:
        file_name: Path of the JSON file to read.

    Raises:
        KeyError: If ``"contentIndex"`` or one of the per-entry keys is
            missing.
        OSError: If the input cannot be read or an output cannot be written.
    """
    with open(file_name, encoding="utf-8") as json_file:
        rows = json.load(json_file)["contentIndex"]

    print(len(rows))
    if rows:  # An empty index has no first entry to describe.
        print(rows[0].keys())

    records = [
        [row["page"], row["name"], row["summary"], row["content"]]
        for row in rows
    ]
    df = pd.DataFrame(records, columns=["page", "name", "summary", "content"])
    df.to_csv("ballerina_content.csv")

    # The per-page files below would fail with FileNotFoundError otherwise.
    os.makedirs("solr_docs", exist_ok=True)

    with open("ballerina_content.txt", "w", encoding="utf-8") as corpus:
        for page, summary in zip(df["page"], df["summary"]):
            if "/learn" not in page:
                continue
            cleaned = _clean_summary(summary)

            corpus.write(cleaned + "???" + page + "<|endoftext|>\n")

            # "/" cannot appear in a file name, so flatten the page path.
            page_path = "solr_docs/" + page.replace("/", "_") + ".txt"
            with open(page_path, "w", encoding="utf-8") as page_file:
                page_file.write(cleaned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment