Skip to content

Instantly share code, notes, and snippets.

@zinedkaloc
Created August 15, 2023 20:13
Show Gist options
  • Save zinedkaloc/631f8e18322e792009125fa2f619b2d0 to your computer and use it in GitHub Desktop.
Save zinedkaloc/631f8e18322e792009125fa2f619b2d0 to your computer and use it in GitHub Desktop.
Analyse data.json and generate wordcloud
import json
from collections import Counter
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Build a word-frequency cloud from the texts stored in data.json, write it to
# wordcloud.png and display it with matplotlib.

# Load the data from a file. The encoding is pinned so that decoding does not
# depend on the platform's default locale encoding.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# 'data' is indexed as a dict whose 'result' entry is iterated, reading the
# 'content' value of every item.
# NOTE(review): assumes every item has a string 'content' -- confirm against
# the producer of data.json.
contents = [item['content'] for item in data['result']]

# Lower-case each text and split it into words (runs of \w characters),
# flattening all texts into one list in a single pass.
word_pattern = re.compile(r'\w+')
words = [
    word
    for content in contents
    for word in word_pattern.findall(content.lower())
]

# Words excluded from the cloud. A set gives O(1) membership tests.
# NOTE(review): 'creat' looks like a truncated form of 'create' -- confirm
# whether it is intentional before changing it.
stop_words = {
    'a', 'of', 'for', 'page', 'and', 'the', 'to', 'with', 'in', 'that', 'my',
    'is', 'an', 'on', 'it', 'i', 'as', 'creat', 'your', 'make', 'or', 's',
    'system', 'services', 'me', 'our', 'can', 'you', 'we', 'have', 'their',
    'from', 'by', 'e', 'like', 'be', 'called', 't', '4', 'us', '1',
}

# Count the frequency of each remaining word.
word_count = Counter(word for word in words if word not in stop_words)

# Fail early with a clear message instead of an error from inside WordCloud
# when there is nothing to draw.
if not word_count:
    raise SystemExit('No words left after removing stop words; nothing to plot.')

# Generate the word cloud from the frequency table.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_count)

# Save the PNG before opening the window: plt.show() blocks on interactive
# backends, and the file should exist even if the display step fails.
wordcloud.to_file('wordcloud.png')

# Display the word cloud using matplotlib, without axes or padding.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment