Skip to content

Instantly share code, notes, and snippets.

@tiborsimon
Last active October 24, 2015 16:10
Show Gist options
  • Save tiborsimon/8a52c046df3a5cae0c59 to your computer and use it in GitHub Desktop.
Save tiborsimon/8a52c046df3a5cae0c59 to your computer and use it in GitHub Desktop.
Generate a list of Hungarian nouns from the Hungarian Wiktionary (hu.wiktionary.org) category pages
import requests
from bs4 import BeautifulSoup
import json
# Module-level accumulator for every word collected so far: appended to by
# extract_words_from_soup() and serialized to words.json by main().
words = []
def main():
    """Crawl the Hungarian Wiktionary noun category and save the words as JSON.

    Starting at the ``Kategória:magyar_főnevek`` category page, each page is
    downloaded and parsed, its words are added to the module-level ``words``
    list by ``extract_words_from_soup``, and the whole list is rewritten to
    ``words.json``. The crawl then follows the 'következő oldal' link and
    stops when a page has no such link or a request does not return HTTP 200.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    base_url = 'https://hu.wiktionary.org'
    next_link = base_url + '/wiki/Kategória:magyar_főnevek'

    while True:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(next_link, timeout=30)
        if r.status_code != 200:
            print('\nStopped: HTTP {} for {}'.format(r.status_code, next_link))
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        extract_words_from_soup(soup)

        # words.json is rewritten after every page, so it always holds
        # everything collected up to the most recently processed page.
        with open('words.json', 'w') as outfile:
            json.dump(words, outfile)

        # find() returns None when the page has no such link; test for that
        # explicitly instead of relying on a TypeError from None['href'].
        anchor = soup.find('a', string='következő oldal')
        if anchor is None or not anchor.get('href'):
            break

        next_link = base_url + anchor['href']
        print('Word count: {} next_link: {}'.format(len(words), next_link))

    print('\nFinished with {} words'.format(len(words)))
def extract_words_from_soup(soup):
    """Append the words listed on one parsed page to the global ``words``.

    Every ``<li>`` element of ``soup`` is visited in document order and the
    text of its first ``<a>`` descendant is appended to ``words``. Scanning
    stops at the first item whose link text is 'magyar szótár'; items whose
    link text is 'magyar főnévi alakok' are skipped.

    List items without a link, or whose link has no single string value, are
    ignored rather than contributing ``None`` entries.

    Args:
        soup: parsed page exposing ``find_all('li')`` (a BeautifulSoup tree).
    """
    for li in soup.find_all('li'):
        anchor = li.a
        if anchor is None:
            # This list item contains no link at all.
            continue

        word = anchor.string
        if word is None:
            # .string is None when the link does not wrap exactly one piece
            # of text, so there is no unambiguous word to record.
            continue

        if word == 'magyar szótár':
            break
        if word != 'magyar főnévi alakok':
            # Store a plain str: a NavigableString keeps a reference to the
            # whole parse tree, which would keep every page alive in memory.
            words.append(str(word))
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment