Parsing and analyzing an HH.ru vacancy data frame
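All snippets below assume a pandas DataFrame df of scraped vacancies with (at least) the columns "Ключевые навыки" (key skills, ";"-separated), "Описание вакансии" (vacancy description), and "Название региона" (region name). A minimal sketch of that assumed setup; the file name vacancies.xlsx is a placeholder:

import pandas as pd

# Hypothetical input: one row per vacancy, skills separated by ";"
df = pd.read_excel('vacancies.xlsx')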
import pandas as pd

# Collect one record per skill in a list
skills_data = []

# Group the rows by skill and count how often each skill occurs
grouped_data = df.groupby("Ключевые навыки")
for skill, skill_data in grouped_data:
    # Take the description from the first row (it is the same for every row in the group)
    content = skill_data.iloc[0]['Описание вакансии']
    # The group size is the skill's frequency
    frequency = len(skill_data)
    skills_data.append({'Класс навыка': skill, 'Содержание': content, 'Частота встречаемости': frequency})

# Build a DataFrame from the list and sort it by frequency, most frequent first
skills_df = pd.DataFrame(skills_data)
skills_df = skills_df.sort_values(by='Частота встречаемости', ascending=False)

# Write the result to an Excel file
skills_df.to_excel('навыки_вакансий.xlsx', index=False)
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Download the Russian stop-word list for nltk
nltk.download('stopwords')
stop_words = set(stopwords.words("russian"))

# Prepare the text data: join all descriptions into a single string
text_data = " ".join(df['Описание вакансии'])

# Histogram of word frequencies (stop words excluded, so they do not dominate the top 20)
words = [word for word in text_data.lower().split() if word not in stop_words]
word_counter = Counter(words)
common_words = word_counter.most_common(20)  # top 20 most frequent words
word_freq_df = pd.DataFrame(common_words, columns=['Слово', 'Частота'])
plt.figure(figsize=(10, 6))
plt.bar(word_freq_df['Слово'], word_freq_df['Частота'])
plt.xticks(rotation=45)
plt.title('Word frequency')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()

# Word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words).generate(text_data)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word cloud')
plt.show()

# Distribution of text lengths (in words)
text_lengths = [len(text.split()) for text in df['Описание вакансии']]
plt.figure(figsize=(10, 6))
plt.hist(text_lengths, bins=30)
plt.title('Distribution of text lengths')
plt.xlabel('Text length (words)')
plt.ylabel('Count')
plt.show()
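One caveat: str.split() keeps punctuation attached, so "Python," and "Python" count as different words. A small sketch of regex-based tokenization that avoids this, using only the standard library (\w+ matches Cyrillic word characters too):

import re

# Extract word characters only, so trailing commas and periods do not split the counts
words = re.findall(r'\w+', text_data.lower())
word_counter = Counter(w for w in words if w not in stop_words)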
import matplotlib.pyplot as plt

# Group the vacancies by region
grouped_data = df.groupby("Название региона")

# Count skill frequencies within each region
skills_frequency_by_region = {}
for region, region_data in grouped_data:
    skills_frequency = {}
    for skills_str in region_data["Ключевые навыки"]:
        if isinstance(skills_str, str):
            for skill in skills_str.split(";"):
                skill = skill.strip()  # drop stray whitespace around each skill
                skills_frequency[skill] = skills_frequency.get(skill, 0) + 1
    skills_frequency_by_region[region] = skills_frequency

# Visualize the results: a horizontal bar chart of the top 10 skills per region
for region, skills_frequency in skills_frequency_by_region.items():
    sorted_skills_frequency = sorted(skills_frequency.items(), key=lambda x: x[1], reverse=True)
    top_skills = sorted_skills_frequency[:10]
    if top_skills:  # skip regions with no skills listed
        skills, frequencies = zip(*top_skills)
        plt.figure(figsize=(10, 6))
        plt.barh(skills, frequencies)
        plt.title(f"Top Skills in {region}")
        plt.xlabel("Frequency")
        plt.ylabel("Skills")
        plt.gca().invert_yaxis()
        plt.show()
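The same per-region counts can be computed without explicit loops; a sketch of an equivalent pandas pipeline, assuming the same ";"-separated skills column:

# Split the skills, explode to one skill per row, and count per region
skills_by_region = (
    df.dropna(subset=["Ключевые навыки"])
      .assign(skill=lambda d: d["Ключевые навыки"].str.split(";"))
      .explode("skill")
      .assign(skill=lambda d: d["skill"].str.strip())
      .groupby("Название региона")["skill"]
      .value_counts()
)
print(skills_by_region.groupby(level=0).head(10))  # top 10 skills per region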
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', 500)

# Bag-of-words features from the preprocessed token strings
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["tokens"])

# Standardize (PCA is sensitive to feature scale), then reduce to 2D
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.toarray())
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Cluster the 2D projection into three groups
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_pca)
df["cluster"] = kmeans.labels_
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words("russian"))
lemmatizer = WordNetLemmatizer()  # WordNet covers English only; Russian tokens pass through unchanged

# Preprocess: tokenize, lowercase, drop stop words and punctuation, lemmatize
def preprocess_text(text):
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(t.lower()) for t in tokens
                    if t.lower() not in stop_words and t.isalnum())

df["tokens"] = df["Описание вакансии"].apply(preprocess_text)
import nltk
from nltk import pos_tag, word_tokenize

nltk.download('averaged_perceptron_tagger')

# POS-tag each description (the default model is trained on English)
def pos_tagging(text):
    return pos_tag(word_tokenize(text))

df["pos_tags"] = df["Описание вакансии"].apply(pos_tagging)
df["pos_tags"]
from textblob import TextBlob

# Label each description by TextBlob polarity.
# Caveat: TextBlob's default analyzer is trained on English, so Russian
# text will mostly score 0.0 and be labeled "Neutral".
def analyze_sentiment(text):
    sentiment_score = TextBlob(text).sentiment.polarity
    if sentiment_score > 0:
        return "Positive"
    elif sentiment_score == 0:
        return "Neutral"
    else:
        return "Negative"

df["sentiment"] = df["Описание вакансии"].apply(analyze_sentiment)
from collections import defaultdict
import nltk
from nltk import pos_tag, ne_chunk, word_tokenize
from nltk.tree import Tree

nltk.download('maxent_ne_chunker')
nltk.download('words')

# Count which word most often follows each word (bigram frequencies)
word_relations = defaultdict(lambda: defaultdict(int))
for tokens in df["tokens"].str.split():  # df["tokens"] holds space-joined strings, so split them back into lists
    for i in range(len(tokens) - 1):
        word_relations[tokens[i]][tokens[i+1]] += 1

# Print the 5 most frequent successors of each word
for word, next_words in word_relations.items():
    sorted_next_words = sorted(next_words.items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"Word '{word}': {sorted_next_words}")

# Understanding the syntactic and semantic structure of sentences
def extract_named_entities(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    named_entities = ne_chunk(tagged_words)
    # Build a dictionary of successor-word frequencies for this text
    word_relations = defaultdict(lambda: defaultdict(int))
    for i in range(len(tagged_words) - 1):
        word_relations[tagged_words[i][0]][tagged_words[i+1][0]] += 1
    # Print the 5 most frequent successors of each word
    for word, next_words in word_relations.items():
        sorted_next_words = sorted(next_words.items(), key=lambda x: x[1], reverse=True)[:5]
        print(f"Word '{word}': {sorted_next_words}")
    return named_entities

for idx, description in enumerate(df["Описание вакансии"]):
    print(f"\nSentence {idx + 1}:")
    named_entities = extract_named_entities(description)
    for entity in named_entities:
        if isinstance(entity, Tree):
            print(' '.join([child[0] for child in entity]), "-", entity.label())
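Like the POS tagger, ne_chunk is trained on English and will find few entities in Russian text. A sketch of Russian NER with the third-party natasha library (pip install natasha); the API below follows natasha's documented quickstart, so treat the exact names as an assumption:

from natasha import Segmenter, NewsEmbedding, NewsNERTagger, Doc

segmenter = Segmenter()
ner_tagger = NewsNERTagger(NewsEmbedding())

def extract_named_entities_ru(text):
    # Segment the text and tag named entities (PER / LOC / ORG)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    return [(span.text, span.type) for span in doc.spans]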
from collections import Counter

# Join all skill strings, split into individual skills, and count them
all_skills = ';'.join(str(skill) for skill in df['Ключевые навыки'] if isinstance(skill, str))
skills_list = [skill.strip() for skill in all_skills.split(';')]
skills_counter = Counter(skills_list)

# The ten most frequent skills
sorted_skills = sorted(skills_counter.items(), key=lambda x: x[1], reverse=True)
print("Most frequent:")
for skill, frequency in sorted_skills[:10]:
    print(f"{skill} - {frequency}")

# The ten least frequent skills
sorted_skills = sorted(skills_counter.items(), key=lambda x: x[1])
print("\nLeast frequent:")
for skill, frequency in sorted_skills[:10]:
    print(f"{skill} - {frequency}")
import requests
import pandas as pd

url = 'https://api.hh.ru/vacancies'
params = {'text': 'python developer'}

try:
    response = requests.get(url, params=params)
    response.raise_for_status()  # raise on HTTP error status codes
except requests.HTTPError as e:
    print('HTTP error:', e)
    print('Status code:', response.status_code)
    print('Response body:', response.text)
    raise
except Exception as e:
    print('Error:', e)
    raise  # stop here; the code below needs a valid response

# Parse the JSON response
data = response.json()

# Build a DataFrame from the list of vacancies
df3 = pd.DataFrame(data['items'])

# Keep only the columns of interest
df3 = df3[['name', 'alternate_url', 'salary', 'area', 'experience']]

# Rename the columns
df3.columns = ['Название', 'Ссылка', 'Зарплата', 'Регион', 'Опыт работы']

df3
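A single request returns only one page of results (20 items by default). A minimal sketch of fetching several pages, assuming the hh.ru API's standard page/per_page parameters and the pages field in its response:

import time

all_items = []
for page in range(5):  # first 5 pages at most
    resp = requests.get(url, params={'text': 'python developer', 'page': page, 'per_page': 100})
    resp.raise_for_status()
    payload = resp.json()
    all_items.extend(payload['items'])
    if page >= payload['pages'] - 1:  # stop when there are no more pages
        break
    time.sleep(0.2)  # be polite to the API

df_all = pd.DataFrame(all_items)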
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Encode the categorical skill labels as integer codes
# (X is assumed to be the feature matrix built earlier; df1 holds the same rows)
label_encoder = LabelEncoder()
df1["Ключевые навыки"] = label_encoder.fit_transform(df1["Ключевые навыки"].fillna('Пусто'))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df1["Ключевые навыки"], test_size=0.2, random_state=42)

# Train an SVC with a linear kernel
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
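The report is easier to interpret against a trivial baseline; a short sketch using scikit-learn's DummyClassifier:

from sklearn.dummy import DummyClassifier

# Always predicts the most frequent class; the SVC should beat this
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_train, y_train)
print("Baseline accuracy:", baseline.score(X_test, y_test))
print("SVC accuracy:", classifier.score(X_test, y_test))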