Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Easy-to-use script for pulling answered questions from the Stack Exchange API
import requests
from bs4 import BeautifulSoup
from helpers.simple_mongo import SimpleMongo
from stackapi import StackAPI
import jsonlines
import openai
import os
"""
{
'tags': ['formatting', 'scientific-publishing', 'synopsis'],
'owner': {
'reputation': 11,
del 'user_id': 51501,
'user_type': 'registered',
del 'profile_image': 'https://www.gravatar.com/avatar/e84010a735486fd0d367e8160fa41c6c?s=128&d=identicon&r=PG&f=1',
del 'display_name': 'Arpita ',
chg 'link': 'https://writing.stackexchange.com/users/51501/arpita' // _id
},
'is_answered': False,
'view_count': 10,
'answer_count': 1,
'score': 0,
'last_activity_date': 1633766298, /// arrow.get('')
'creation_date': 1633758860, /// arrow.get('')
'question_id': 59252, /// _id
'content_license': 'CC BY-SA 4.0', /// don't need lol (of course you do for prod apps, this ain't prod)
'link': 'https://writing.stackexchange.com/questions/59252/how-do-i-write-a-synopsis-for-a-scientific-article-publication',
'title': 'How do I write a synopsis for a scientific article? publication?'
}
"""
""" the response for the fetch "questions" has these fields:
'backoff' = {int} 0
'has_more' = {bool} True
'page' = {int} 1
'quota_max' = {int} 300
'quota_remaining' = {int} 298
'total' = {int} 0
"""
def process_question(question, timeout=30):
    """Scrape the question text and answer texts from a question's web page.

    Args:
        question: A question item from the Stack Exchange API. Only its
            'link' key is read here; every key is copied into the result.
        timeout: Seconds to wait for the page before giving up, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        A shallow copy of ``question`` with two extra keys: 'question' (the
        text of the question body) and 'answers' (a list with the text of
        each answer body found on the page).

    Raises:
        requests.RequestException: On a network error, a timeout, or a
            4xx/5xx response.
        ValueError: If no question body can be located in the page.
    """
    question_link = question['link']
    print(f'Processing {question_link} ...')

    se_page_response = requests.get(question_link, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as if
    # it were a question.
    se_page_response.raise_for_status()
    soup = BeautifulSoup(se_page_response.text, 'html.parser')

    # Question and answer bodies are looked up with the same class attribute.
    post_body_attrs = {'class': 's-prose js-post-body'}

    question_ele = soup.find('div', attrs={'class': 'question'})
    question_content_ele = None
    if question_ele is not None:
        question_content_ele = question_ele.find('div', attrs=post_body_attrs)
    if question_content_ele is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError(f'No question body found at {question_link}')

    se_page_data = dict(question)
    se_page_data['question'] = question_content_ele.text

    answers = []
    for answer in soup.find_all('div', attrs={'class': 'answer'}):
        answer_content = answer.find('div', attrs=post_body_attrs)
        if answer_content is None:
            # Skip answer containers without a body rather than discarding
            # the whole question.
            continue
        answers.append(answer_content.text)
    se_page_data['answers'] = answers
    return se_page_data
def gather_answered_questions(output_collection):
    """Scrape answered questions from several Stack Exchange sites.

    Questions are fetched from the 'writing', 'puzzling' and 'philosophy'
    sites in batches. Every question with at least one answer is scraped with
    ``process_question`` and stored via ``output_collection.insert_one``.

    Each site is paged through with an explicit ``page`` cursor, so successive
    iterations fetch new questions instead of re-requesting the same first
    pages. The loop ends once every site reports ``has_more`` as false.

    Args:
        output_collection: Object exposing ``insert_one(document=...)``;
            receives one document per successfully scraped question.
    """
    page_size = 100
    pages_per_fetch = 5
    site_names = ('writing', 'puzzling', 'philosophy')

    # One API client per site, all configured identically.
    active_apis = {}
    for site_name in site_names:
        site_api = StackAPI(site_name)
        site_api.page_size = page_size
        site_api.max_pages = pages_per_fetch
        active_apis[site_name] = site_api

    # First page to request from each site on the next fetch.
    next_page = {site_name: 1 for site_name in site_names}

    count_just_cuz = 0
    while active_apis:
        all_qs = []
        # Iterate over a snapshot because exhausted sites are removed below.
        for site_name in list(active_apis):
            response = active_apis[site_name].fetch(
                'questions', page=next_page[site_name]
            )
            all_qs.extend(response['items'])
            if response.get('has_more'):
                # A fetch covers up to `pages_per_fetch` consecutive pages.
                next_page[site_name] += pages_per_fetch
            else:
                del active_apis[site_name]

        for question in all_qs:
            if question['answer_count'] > 0:
                # Best effort: one bad page must not stop the whole run.
                try:
                    qa_data = process_question(question)
                    output_collection.insert_one(document=qa_data)
                except Exception as e:
                    print(e.args)
        print(f'Progress {count_just_cuz}!')
        count_just_cuz += 1
def _build_training_records(answered_questions, max_records):
    """Convert scraped Q&A documents into prompt/completion records.

    Documents with an empty 'answers' list are skipped. The remaining records
    are ordered by descending question score and truncated to ``max_records``,
    so the cap keeps the highest-scored questions.
    """
    ranked = []
    for answered_question in answered_questions:
        answers = answered_question['answers']
        if not answers:
            # Nothing to use as a completion.
            continue
        title = answered_question['title']
        question = answered_question['question'].replace('\n', '')
        record = {
            'prompt': f'{title}\n\n{question}\n\n###\n\n',
            'completion': answers[0].replace('\n', ''),
        }
        ranked.append((answered_question['score'], record))
    # Sort in place: the original called sorted() and discarded the result.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [record for _, record in ranked[:max_records]]


def main():
    """Export scraped Q&A data to a JSONL file and start a fine-tune job.

    Reads every document from the 'writing_qa' collection, writes up to
    ``guess_max`` prompt/completion records to a JSONL file, uploads that file
    with ``openai.File.create`` and launches ``openai.FineTune.create`` on it.
    The API key is read from the OPENAI_API_KEY environment variable.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')

    OUT_DIR = '.../output/'  # Change this
    filename = 'generalized_question_answerer.jsonl'
    filepath_out = f'{OUT_DIR}{filename}'

    _database = 'question_answering'
    _writing_qa_col_name = 'writing_qa'
    _puzzles_qa_col_name = 'puzzles_qa'

    simple_mongo = SimpleMongo(_database, [_writing_qa_col_name, _puzzles_qa_col_name])
    writing_qa_col = simple_mongo.collections[_writing_qa_col_name]

    # Uncomment to scrape fresh data into the collection first. It is slow:
    # one HTTP request is made per answered question.
    # gather_answered_questions(writing_qa_col)

    guess_max = 4500
    json_records = _build_training_records(writing_qa_col.find({}), guess_max)

    with jsonlines.open(filepath_out, mode='w') as writer:
        for i, json_record in enumerate(json_records):
            # Best effort: report a record that cannot be written and move on.
            try:
                writer.write(json_record)
            except Exception as e:
                print(e)
            if i > 0 and i % 100 == 0:
                print(f'Processed {i} so far...')

    # Binary mode uploads the bytes exactly as written, independent of the
    # platform's default text encoding; `with` closes the handle afterwards.
    with open(filepath_out, 'rb') as training_file:
        response = openai.File.create(
            file=training_file,
            purpose='fine-tune'
        )
    print(response)

    fine_tune_response = openai.FineTune.create(
        training_file=response['id'],
        n_epochs=4,
        learning_rate_multiplier=0.07,  # 0.01-0.4
        batch_size=40,
        use_packing=True,
        prompt_loss_weight=1.00
    )
    # Show the created job so it can be followed up on.
    print(fine_tune_response)
# Run the export and fine-tune pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment