Finding groups of phrases related by word context
import os
import glob
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Download the required NLTK resources
# (newer NLTK releases may additionally need nltk.download('punkt_tab'))
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

def load_sentences_from_txt(folder="."):
    # Read every *.txt file in the folder and split its text into sentences
    sentences = []
    files = glob.glob(os.path.join(folder, "*.txt"))
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        sents = nltk.sent_tokenize(text, language="russian")  # Russian Punkt model
        sentences.extend(sents)
    return sentences

def extract_key_phrases(sentences, n_clusters=10, n_phrases=20):
    # Vectorize the sentences with TF-IDF, dropping Russian stopwords and overly frequent terms
    russian_stopwords = stopwords.words('russian')
    vectorizer = TfidfVectorizer(stop_words=russian_stopwords, max_df=0.7)
    X = vectorizer.fit_transform(sentences)
    # Cluster the sentence vectors
    km = KMeans(n_clusters=n_clusters, random_state=42)
    km.fit(X)
    # Term indices sorted by weight within each cluster centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    # Group sentences by their cluster label
    clusters = {i: [] for i in range(n_clusters)}
    for idx, label in enumerate(km.labels_):
        clusters[label].append(sentences[idx])
    # The largest cluster is taken as the dominant shared context
    largest_cluster = max(clusters.items(), key=lambda x: len(x[1]))[0]
    print("Key words:")
    for ind in order_centroids[largest_cluster, :10]:
        print(terms[ind], end=", ")
    print("\n")
    selected_sentences = clusters[largest_cluster][:n_phrases]
    return selected_sentences

def main():
    sentences = load_sentences_from_txt(".")
    if not sentences:
        print("No txt files in the folder, or they are empty.")
        return
    phrases = extract_key_phrases(sentences, n_clusters=10, n_phrases=20)
    print("20 phrases related by context:")
    for i, phrase in enumerate(phrases, 1):
        print(f"{i}. {phrase}")

if __name__ == "__main__":
    main()
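
The number of clusters above is hardcoded to 10. A possible refinement, not part of the original code, is to pick n_clusters by silhouette score; a minimal sketch, assuming the corpus contains more sentences than the largest candidate k:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_n_clusters(X, candidates=(2, 4, 6, 8, 10, 12)):
    # Fit KMeans for each candidate k and keep the one with the best silhouette score
    best_k, best_score = candidates[0], -1.0
    for k in candidates:
        km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
        score = silhouette_score(X, km.labels_)
        if score > best_score:
            best_k, best_score = k, score
    return best_k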
## Another variant of the code for finding phrases with a shared word-group context
import os
import glob
import nltk
# Set the environment variable before sklearn is imported
os.environ["LOKY_MAX_CPU_COUNT"] = "6"  # number of cores on your CPU
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Download the required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

def load_sentences_from_txt(folder="."):
    # Read every *.txt file in the folder and split its text into sentences
    sentences = []
    files = glob.glob(os.path.join(folder, "*.txt"))
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        sents = nltk.sent_tokenize(text, language="russian")  # Russian Punkt model
        sentences.extend(sents)
    return sentences

def extract_key_phrases(sentences, n_clusters=10, n_phrases=20):
    # Vectorize the sentences with TF-IDF, dropping Russian stopwords and overly frequent terms
    russian_stopwords = stopwords.words('russian')
    vectorizer = TfidfVectorizer(stop_words=russian_stopwords, max_df=0.7)
    X = vectorizer.fit_transform(sentences)
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # n_init set explicitly
    km.fit(X)
    # Term indices sorted by weight within each cluster centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    # Group sentences by their cluster label
    clusters = {i: [] for i in range(n_clusters)}
    for idx, label in enumerate(km.labels_):
        clusters[label].append(sentences[idx])
    # The largest cluster is taken as the dominant shared context
    largest_cluster = max(clusters.items(), key=lambda x: len(x[1]))[0]
    print("Key words:")
    for ind in order_centroids[largest_cluster, :10]:
        print(terms[ind], end=", ")
    print("\n")
    selected_sentences = clusters[largest_cluster][:n_phrases]
    return selected_sentences

def main():
    sentences = load_sentences_from_txt(".")
    if not sentences:
        print("No txt files in the folder, or they are empty.")
        return
    phrases = extract_key_phrases(sentences, n_clusters=10, n_phrases=20)
    print("20 phrases related by context:")
    for i, phrase in enumerate(phrases, 1):
        print(f"{i}. {phrase}")

if __name__ == "__main__":
    main()
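
A minimal usage sketch showing the two helper functions called directly instead of through main(); the folder name "texts" and the output file "phrases.txt" are assumptions, not part of the original:

# Usage sketch: the folder "texts" and output file "phrases.txt" are assumed names
sentences = load_sentences_from_txt("texts")
if sentences:
    phrases = extract_key_phrases(sentences, n_clusters=5, n_phrases=10)
    with open("phrases.txt", "w", encoding="utf-8") as out:
        out.write("\n".join(phrases))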