當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

自动摘要

發布時間：2024/7/5 编程问答 29 豆豆

生活随笔收集整理的這篇文章主要介紹了自动摘要，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

#!/usr/bin/env python3
# coding:utf-8

import nltk
import numpy
import jieba
import codecs
import os

class SummaryTxt:
    """Extractive text summarizer driven by keyword clusters.

    The text is split into sentences, tokenized with ``jieba``, and the most
    frequent non-stopword tokens are treated as keywords. Each sentence is
    scored by its densest cluster of keywords, and the best-scoring sentences
    form the summary.
    """

    # Characters that end a sentence (ASCII and full-width forms).
    SENTENCE_TERMINATORS = '.!?。！？'

    def __init__(self, stopwordspath):
        """Set the tuning parameters and load the stopword list.

        :param stopwordspath: path to a UTF-8 text file with one stopword per
            line. If the path does not exist, no stopwords are used.
        """
        # Number of most frequent words used as keywords.
        self.N = 100
        # Two keywords closer than this many tokens belong to the same cluster.
        self.CLUSTER_THRESHOLD = 5
        # Number of sentences returned by summaryTopNtxt.
        self.TOP_SENTENCES = 5
        # NOTE: the attribute name (sic) is kept so existing callers that read
        # it keep working. Keys are stopwords, values are always None.
        self.stopwrods = {}
        if os.path.exists(stopwordspath):
            # The context manager closes the file handle deterministically.
            with open(stopwordspath, 'r', encoding='utf8') as stopfile:
                self.stopwrods = dict.fromkeys(
                    line.strip() for line in stopfile)

    def _split_sentences(self, texts):
        """Split ``texts`` into sentences.

        A sentence ends at a run of one or more terminator characters
        (``.!?。！？``); the whole run stays attached to the sentence it closes,
        so ``"Hi!! Ok."`` becomes ``["Hi!!", " Ok."]``. Trailing text without
        a terminator is returned as a final sentence.

        :param texts: the text to split
        :return: list of sentence strings, in document order
        """
        terminators = self.SENTENCE_TERMINATORS
        length = len(texts)
        sentences = []
        start = 0
        index = 0
        while index < length:
            if texts[index] not in terminators:
                index += 1
                continue
            # Absorb any terminators that follow immediately, so "?!" or "..."
            # does not produce punctuation-only sentences.
            end = index + 1
            while end < length and texts[end] in terminators:
                end += 1
            sentences.append(texts[start:end])
            start = index = end
        if start < length:
            # The text does not end with a terminator.
            sentences.append(texts[start:])
        return sentences

    def _score_sentences(self, sentences, topn_words):
        """Score each sentence by its densest cluster of keywords.

        For every sentence, the position of the first occurrence of each
        keyword is collected. Neighbouring positions closer than
        ``CLUSTER_THRESHOLD`` form a cluster, and a cluster scores
        ``keywords ** 2 / span``. The sentence score is its best cluster
        score. Sentences containing no keyword are omitted from the result.

        :param sentences: list of sentence strings
        :param topn_words: list of keywords
        :return: list of ``(sentence_index, score)`` tuples in sentence order
        """
        scores = []
        for sentence_idx, sentence in enumerate(sentences):
            # Map each token to the index of its first occurrence.
            first_pos = {}
            for pos, token in enumerate(jieba.cut(sentence)):
                first_pos.setdefault(token, pos)

            word_idx = sorted(
                first_pos[w] for w in topn_words if w in first_pos)
            if not word_idx:
                continue

            # Group sorted keyword positions into clusters by distance.
            clusters = [[word_idx[0]]]
            for prev, cur in zip(word_idx, word_idx[1:]):
                if cur - prev < self.CLUSTER_THRESHOLD:
                    clusters[-1].append(cur)
                else:
                    clusters.append([cur])

            # A cluster's span counts every token between its first and last
            # keyword, inclusive.
            max_cluster_score = max(
                len(c) * len(c) / (c[-1] - c[0] + 1) for c in clusters)
            scores.append((sentence_idx, max_cluster_score))
        return scores

    def _rank_sentences(self, text):
        """Split ``text`` and score its sentences against its top keywords.

        :param text: the text to summarize
        :return: ``(sentences, scored_sentences)`` where ``scored_sentences``
            is the output of ``_score_sentences``
        """
        sentences = self._split_sentences(text)

        # Keep tokens longer than one character that are not stopwords.
        # (The length check also excludes a lone tab character.)
        words = [w
                 for sentence in sentences
                 for w in jieba.cut(sentence)
                 if len(w) > 1 and w not in self.stopwrods]

        # Word frequencies, most frequent first; keep the top N as keywords.
        wordfre = nltk.FreqDist(words)
        ranked = sorted(wordfre.items(), key=lambda item: item[1],
                        reverse=True)
        topn_words = [word for word, _ in ranked[:self.N]]

        return sentences, self._score_sentences(sentences, topn_words)

    def summaryScoredtxt(self, text):
        """Summarize ``text`` with a mean/standard-deviation threshold.

        Keeps every sentence whose score is greater than
        ``mean + 0.5 * std`` of all sentence scores. Each kept sentence is
        also printed.

        :param text: the text to summarize
        :return: list of selected sentences in document order; empty when no
            sentence contains a keyword
        """
        sentences, scored_sentences = self._rank_sentences(text)
        if not scored_sentences:
            # Avoid numpy.mean/std on an empty list (NaN + RuntimeWarning).
            return []

        values = [score for _, score in scored_sentences]
        threshold = numpy.mean(values) + 0.5 * numpy.std(values)

        summarySentences = []
        for sent_idx, score in scored_sentences:
            if score > threshold:
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences

    def summaryTopNtxt(self, text):
        """Summarize ``text`` with its ``TOP_SENTENCES`` best sentences.

        The highest-scoring sentences are selected and returned in document
        order. Each selected sentence is also printed.

        :param text: the text to summarize
        :return: list of selected sentences in document order
        """
        sentences, scored_sentences = self._rank_sentences(text)
        if self.TOP_SENTENCES <= 0:
            # A slice of [-0:] would select every sentence.
            return []

        top_n_scored = sorted(
            scored_sentences, key=lambda s: s[1])[-self.TOP_SENTENCES:]
        # Restore document order for readability.
        top_n_scored.sort(key=lambda s: s[0])

        summarySentences = []
        for idx, _ in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])

        # Return the summary, not the full sentence list.
        return summarySentences

if __name__ == '__main__':
    # Raw strings keep the Windows backslashes literal; '\c' and '\d' are
    # invalid escape sequences in a normal string literal.
    obj = SummaryTxt(r'E:\comments\cn_stopwords.txt')
    # NOTE(review): the file is read with the platform default encoding;
    # pass encoding=... here if the data file's encoding is known.
    with open(r'E:\comments\data.txt', "r") as f:
        txt = f.read()
    print(txt)
    print("--")
    # Both summary methods print the sentences they select.
    obj.summaryScoredtxt(txt)

    print("----")
    obj.summaryTopNtxt(txt)

總結

以上是生活随笔為你收集整理的自动摘要的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

摘要

上一篇：判断平台是windows还是linux，
下一篇： python中的_init_的使用