Last active
October 24, 2015 16:10
-
-
Save tiborsimon/8a52c046df3a5cae0c59 to your computer and use it in GitHub Desktop.
Generate noun list from wiki site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

import requests
from bs4 import BeautifulSoup
# Module-level accumulator: every word extracted from the crawled pages is
# appended here, and the whole list is dumped to words.json after each page.
words = []
def main():
    """Crawl the category pages and save the collected words to words.json.

    Starts at the 'Kategória:magyar_főnevek' category page on
    hu.wiktionary.org and follows the link whose text is 'következő oldal'
    ("next page") until no such link is found or a request returns a
    non-200 status. After each page the full ``words`` list is rewritten to
    ``words.json`` so an interrupted run still leaves usable output.

    Network errors raised by ``requests`` (including timeouts) propagate to
    the caller.
    """
    base_url = 'https://hu.wiktionary.org'
    next_link = base_url + '/wiki/Kategória:magyar_főnevek'

    while True:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(next_link, timeout=30)
        if r.status_code != 200:
            print('Stopping: HTTP {} for {}'.format(r.status_code, next_link))
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        extract_words_from_soup(soup)

        # Checkpoint after every page; json.dump escapes non-ASCII by default.
        with open('words.json', 'w', encoding='utf-8') as outfile:
            json.dump(words, outfile)

        # Check for the last page explicitly instead of relying on a
        # TypeError from subscripting None, which could hide unrelated bugs.
        next_anchor = soup.find('a', string='következő oldal')
        if next_anchor is None or not next_anchor.get('href'):
            print('\nFinished with {} words'.format(len(words)))
            break

        next_link = base_url + next_anchor['href']
        print('Word count: {} next_link: {}'.format(len(words), next_link))
def extract_words_from_soup(soup):
    """Append the link text of each ``<li>`` item in *soup* to ``words``.

    Iterates over every ``li`` element in document order and reads the text
    of its first ``<a>`` child:

    * items without an anchor, or whose anchor has no single string, are
      skipped;
    * the text 'magyar szótár' stops the scan (presumably the first
      non-word navigation link after the listing -- confirm against the
      live page);
    * the text 'magyar főnévi alakok' is skipped (presumably a subcategory
      link rather than a word -- confirm against the live page).

    Args:
        soup: A parsed page offering ``find_all('li')``, e.g. a
            ``BeautifulSoup`` object.
    """
    for li in soup.find_all('li'):
        anchor = li.a
        if anchor is None:
            continue

        word = anchor.string
        if word is None:
            # Anchor with nested markup or no text: nothing usable to store.
            continue

        # Store a plain str so the list does not keep each parsed page alive
        # through the parse-tree references a NavigableString carries.
        word = str(word)

        if word == 'magyar szótár':
            break
        if word != 'magyar főnévi alakok':
            words.append(word)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment