Python实现Flesch阅读易读性公式计算
生活随笔
收集整理的這篇文章主要介紹了
Python实现Flesch阅读易读性公式计算
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
計算英文文本的可讀性:
# 1.計算單詞數 # 2.計算句子數 # 3.計算音節數 # 計算RE值 # RE = 206.835 - (1.015 x ASL) - (84.6 x ASW) # RE =可讀性緩解 # ASL =平均句子長度(即單詞數除以句子數) # ASW =每個單詞的平均音節數(即,音節數除以單詞數) import re import pronouncingdef word_list(filename):'''返回單詞列表'''try:with open(filename, 'r', encoding='UTF-8') as f:content = f.read()except FileNotFoundError:errmsg = filename + '文件不存在'print(errmsg)else:word_re = re.compile(r'[^A-Za-z’\']+')words = word_re.split(content.lower())return wordsdef sentence_count(filename):'''計算句子長度'''try:with open(filename, 'r', encoding='UTF-8') as f:content = f.read()except FileNotFoundError:errmsg = filename + '文件不存在'else:point_re = re.compile(r'\.')point = point_re.split(content)# print('句子長度:' + str(point))return (len(point))def get_pronouncing_num(word):'''計算單詞音節數'''# https://pronouncing.readthedocs.io/en/latest/tutorial.html#counting-syllablestry:pronunciation_list = pronouncing.phones_for_word(word)num = pronouncing.syllable_count(pronunciation_list[0])except Exception as e:print('計算音節數異常:異常單詞:"' + word + '"')return 1else:return numdef get_pronouncing_nums(words):'''計算文本音節總數'''counts = 0for word in words:counts += get_pronouncing_num(word)return counts# 計算RE值 # RE = 206.835 - (1.015 x ASL) - (84.6 x ASW) # RE =可讀性緩解 # ASL =平均句子長度(即單詞數除以句子數) # ASW =每個單詞的平均音節數(即,音節數除以單詞數)if __name__ == '__main__':filename = 'detail.txt'# 求ASL 單詞數/句子數word_num = len(word_list(filename))sentence_num = sentence_count(filename)print(str(word_num) + ',' + str(sentence_num))ASL = word_num / sentence_num# 求ASW 音節數/單詞數 pronouncing_num/word_numwords = word_list(filename)print(len(words))pronouncing_nums = get_pronouncing_nums(words)ASW = pronouncing_nums / word_num# 求RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)RE = 206.835 - (1.015 * ASL) - (84.6 * ASW)print('ASW:' + str(ASW))print('ASL:' + str(ASL))print('RE:' + str(RE))計算中文文本的可讀性:
# 計算文本可讀性 import re import jieba import cntext as ct import numpy as np STOPWORDS_zh = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['chinese'] STOPWORDS_en = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['english'] ADV_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['ADV'] CONJ_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['CONJ']# 中文分詞 def cn_seg_sent(text):#split the chinese text into sentencestext = re.sub('([。!;?;\?])([^”’])', "[[end]]", text) # 單字符斷句符text = re.sub('([。!?\?][”’])([^,。!?\?])', "[[end]]", text)text = re.sub('\s', '', text)# 如果雙引號前有終止符,那么雙引號才是句子的終點,把分句符\n放到雙引號后,注意前面的幾句都小心保留了雙引號return text.split("[[end]]")def readability(text, zh_advconj=None, lang='chinese'):"""text readability, the larger the indicator, the higher the complexity of the article and the worse the readability.:param text: text string:param zh_advconj Chinese conjunctions and adverbs, receive list data type. By default, the built-in dictionary of cntext is used:param language: "chinese" or "english"; default is "chinese"------------【English readability】english_readability = 4.71 x (characters/words) + 0.5 x (words/sentences) - 21.43;【Chinese readability】 Refer 【徐巍,姚振曄,陳冬華.中文年報可讀性:衡量與檢驗[J].會計研究,2021(03):28-44.】readability1 ---每個分句中的平均字數readability2 ---每個句子中副詞和連詞所占的比例readability3 ---參考Fog Index, readability3=(readability1+readability2)×0.5以上三個指標越大,都說明文本的復雜程度越高,可讀性越差。"""if lang=='english':text = text.lower()#將浮點數、整數替換為numtext = re.sub('\d+\.\d+|\.\d+', 'num', text)num_of_characters = len(text)#英文分詞rgx = re.compile("(?:(?:[^a-zA-Z]+')|(?:'[^a-zA-Z]+))|(?:[^a-zA-Z']+)")num_of_words = len(re.split(rgx, text))#分句num_of_sentences = len(re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text))ari = (4.71 * (num_of_characters / num_of_words)+ 0.5 * (num_of_words / num_of_sentences)- 21.43)return {"readability": ari}if lang=='chinese':if zh_advconj:adv_conj_words = zh_advconjelse:adv_conj_words = set(ADV_words + CONJ_words)zi_num_per_sent = []adv_conj_ratio_per_sent = []text = 
re.sub('\d+\.\d+|\.\d+', 'num', text)#【分句】sentences = cn_seg_sent(text)for sent in sentences:adv_conj_num = 0zi_num_per_sent.append(len(sent))words = list(jieba.cut(sent))for w in words:if w in adv_conj_words:adv_conj_num+=1adv_conj_ratio_per_sent.append(adv_conj_num/(len(words)+1))readability1 = np.mean(zi_num_per_sent)readability2 = np.mean(adv_conj_ratio_per_sent)readability3 = (readability1+readability2)*0.5return {'readability1': readability1,'readability2': readability2,'readability3': readability3} text1 = "我是個小孩子,我想快快樂樂地成長,慢慢長大。" text2 = '趙客縵胡纓,吳鉤霜雪明。銀鞍照白馬,颯沓如流星。十步殺一人,千里不留行。事了拂衣去,深藏身與名。閑過信陵飲,脫劍膝前橫。將炙啖朱亥,持觴勸侯嬴。三杯吐然諾,五岳倒為輕。眼花耳熱后,意氣素霓生。救趙揮金槌,邯鄲先震驚。千秋二壯士,烜赫大梁城。縱死俠骨香,不慚世上英。誰能書閣下,白首太玄經。' print(readability(text1,lang='chinese')) print(readability(text2,lang='chinese'))來源:
- https://blog.csdn.net/granery/article/details/88912059
- https://mp.weixin.qq.com/s/kgqRavPtoUq3ZPLrpooSrA
總結
以上是生活随笔為你收集整理的Python实现Flesch阅读易读性公式计算的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 每日英语:China Destroys
- 下一篇: oracle数据库本地连接报错:监听程序