python文本时间提取
生活随笔
收集整理的這篇文章主要介紹了
python文本时间提取
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
來源于《Python自然語言處理實戰》
導入庫
import re
from datetime import datetime, timedelta

from dateutil.parser import parse
import jieba.posseg as psg

# Single-character numerals (Chinese and ASCII) mapped to their integer value.
# '兩' is listed because the pattern used by parse_datetime accepts it; without
# this entry cn2dig would reject such input and the field would be dropped.
UTIL_CN_NUM = {
    '零': 0, '一': 1, '二': 2, '兩': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
}

# Chinese magnitude characters mapped to their multiplier.
UTIL_CN_UNIT = {'十': 10, '百': 100, '千': 1000, '萬': 10000}

# The year2dig and cn2dig functions: using the predefined tables above, they
# convert concrete text into the corresponding numbers; parse_datetime then
# wraps them.
def cn2dig(src):
    """Convert a numeral string (ASCII digits or Chinese numerals) to an int.

    If ``src`` starts with digits, that leading run of digits is returned.
    Otherwise the characters are read right to left: a character found in
    UTIL_CN_UNIT sets the current multiplier, and a character found in
    UTIL_CN_NUM is scaled by that multiplier and added to the total.

    Returns None for an empty string or when an unknown character is met.
    """
    if src == "":
        return None
    m = re.match(r"\d+", src)
    if m:
        return int(m.group(0))
    total = 0
    unit = 1
    leading_unit = False  # True while the leftmost character seen so far is a unit
    for ch in reversed(src):
        if ch in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[ch]
            leading_unit = True
        elif ch in UTIL_CN_NUM:
            total += UTIL_CN_NUM[ch] * unit
            leading_unit = False
        else:
            return None
    # A bare leading unit implies a coefficient of one ('十五' -> 15).
    # Only apply this when the text really starts with a unit, so that a
    # plain zero such as '零' stays 0 instead of becoming 1.
    if leading_unit and total < unit:
        total += unit
    return total


def year2dig(year):
    """Convert a year written with ASCII or Chinese digits to an int.

    Each character found in UTIL_CN_NUM is replaced by its digit; other
    characters are kept. The leading run of digits of the result is the
    year. A two-digit year is placed in the current century.

    Returns None when the converted text does not start with a digit.
    """
    digits = ''.join(
        str(UTIL_CN_NUM[ch]) if ch in UTIL_CN_NUM else ch for ch in year
    )
    m = re.match(r"\d+", digits)
    if m is None:
        return None
    value = int(m.group(0))
    if len(m.group(0)) == 2:
        # `datetime` is the class imported via `from datetime import datetime`,
        # so today() is called on it directly.
        return datetime.today().year // 100 * 100 + value
    return value


# parse_datetime wraps the two helpers above.
def parse_datetime(msg):
    """Turn a date/time expression into a 'YYYY-MM-DD HH:MM:SS' string.

    The text is first handed to dateutil's ``parse`` in fuzzy mode. If that
    raises, a fixed pattern for Chinese date/time expressions is tried.

    Returns None for None/empty input or when neither approach recognises
    the text.
    """
    if msg is None or len(msg) == 0:
        return None
    try:
        dt = parse(msg, fuzzy=True)
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        # Deliberate best-effort: any failure in dateutil means "not a format
        # it understands", so fall back to the Chinese pattern.
        return _parse_cn_datetime(msg)


def _parse_cn_datetime(msg):
    """Parse a Chinese date/time expression with a fixed pattern.

    Fields found in ``msg`` overwrite the matching fields of today's date;
    hour, minute and second default to zero when absent. Returns the
    formatted string, or None if nothing matched or the resulting date is
    invalid.
    """
    m = re.match(
        r"([0-9零一二兩三四五六七八九十]+年)?"
        r"([0-9一二兩三四五六七八九十]+月)?"
        r"([0-9一二兩三四五六七八九十]+[號日])?"
        r"([上中下午晚早]+)?"
        r"([0-9零一二兩三四五六七八九十百]+[點:\.時])?"
        r"([0-9零一二三四五六七八九十百]+分?)?"
        r"([0-9零一二三四五六七八九十百]+秒)?",
        msg,
    )
    # Every group is optional, so the match object always exists; an empty
    # match means no date/time field was found at the start of the text.
    if not m.group(0):
        return None
    res = {
        "year": m.group(1),
        "month": m.group(2),
        "day": m.group(3),
        "hour": m.group(5) if m.group(5) is not None else '00',
        "minute": m.group(6) if m.group(6) is not None else '00',
        "second": m.group(7) if m.group(7) is not None else '00',
    }
    params = {}
    for name, raw in res.items():
        if not raw:
            continue
        # Strip only a real unit suffix. The '分' suffix is optional in the
        # pattern, so blindly dropping the last character would turn a
        # minute of '30' into '3'.
        number = raw.rstrip('年月號日點:.時分秒')
        value = year2dig(number) if name == 'year' else cn2dig(number)
        if value is not None:
            params[name] = int(value)
    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # Out-of-range field or impossible date (e.g. day 31 in a 30-day month).
        return None
    is_pm = m.group(4)
    if is_pm in ('下午', '晚上', '中午') and target_date.hour < 12:
        target_date = target_date.replace(hour=target_date.hour + 12)
    return target_date.strftime('%Y-%m-%d %H:%M:%S')


# The check_time_valid function post-processes the concatenated date strings
# that were extracted, e.g. unifying 日 and 號.
def check_time_valid(word):
    """Filter and normalise one candidate time string.

    Returns None when ``word`` consists only of digits and is at most six
    characters long. Otherwise a trailing '號<digits>' or '日<digits>' is
    collapsed to '日' (repeated until nothing changes) and the resulting
    string is returned.
    """
    while True:
        if re.match(r"\d+$", word) and len(word) <= 6:
            return None
        # The character class is [號日]; a '|' inside a class would be
        # matched literally rather than acting as alternation.
        normalised = re.sub(r'[號日]\d+$', '日', word)
        if normalised == word:
            return normalised
        word = normalised


# time_extract parses a sentence, extracts every word in it that can denote a
# date or time, and joins adjacent ones using context. For example, if
# part-of-speech tagging yields “今天/t 住/v 到/v 明天/t 下午/t 3/m點/m”, then
# “今天” and “明天下午3點” need to be extracted.
# Time extraction
def time_extract(text):
    """Extract every date/time expression from ``text``.

    The sentence is segmented with ``psg.cut``. Consecutive tokens tagged
    'm' or 't' are concatenated into one candidate; the relative-day words
    in ``key_date`` start a new candidate and are replaced by the
    corresponding calendar date. Candidates are filtered through
    check_time_valid and converted with parse_datetime.

    Returns a list of the non-None parse_datetime results.
    """
    time_res = []
    word = ''
    key_date = {'今天': 0, '明天': 1, '后天': 2}
    for k, v in psg.cut(text):
        if k in key_date:
            if word != '':
                time_res.append(word)
            day = datetime.today() + timedelta(days=key_date[k])
            # The {y}/{m}/{d} placeholders survive strftime untouched and are
            # filled with the Chinese unit characters afterwards.
            word = day.strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
        elif word != '':
            if v in ['m', 't']:
                word = word + k
            else:
                time_res.append(word)
                word = ''
        elif v in ['m', 't']:
            word = k
    if word != '':
        time_res.append(word)
    checked = [check_time_valid(w) for w in time_res]
    parsed = [parse_datetime(w) for w in checked if w is not None]
    return [x for x in parsed if x is not None]


# Full listing
import re
from datetime import datetime, timedelta

from dateutil.parser import parse
import jieba.posseg as psg

# Single-character numerals (Chinese and ASCII) mapped to their integer value.
UTIL_CN_NUM = {
    '零': 0, '一': 1, '二': 2, '兩': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
}

# Chinese magnitude characters mapped to their multiplier.
UTIL_CN_UNIT = {'十': 10, '百': 100, '千': 1000, '萬': 10000}


def cn2dig(src):
    """Convert a numeral string (ASCII digits or Chinese numerals) to an int.

    If ``src`` starts with digits, that leading run of digits is returned.
    Otherwise the characters are read right to left: a character found in
    UTIL_CN_UNIT sets the current multiplier, and a character found in
    UTIL_CN_NUM is scaled by that multiplier and added to the total.

    Returns None for an empty string or when an unknown character is met.
    """
    if src == "":
        return None
    m = re.match(r"\d+", src)
    if m:
        return int(m.group(0))
    total = 0
    unit = 1
    leading_unit = False  # True while the leftmost character seen so far is a unit
    for ch in reversed(src):
        if ch in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[ch]
            leading_unit = True
        elif ch in UTIL_CN_NUM:
            total += UTIL_CN_NUM[ch] * unit
            leading_unit = False
        else:
            return None
    # A bare leading unit implies a coefficient of one ('十五' -> 15).
    # Only apply this when the text really starts with a unit, so that a
    # plain zero such as '零' stays 0 instead of becoming 1.
    if leading_unit and total < unit:
        total += unit
    return total


def year2dig(year):
    """Convert a year written with ASCII or Chinese digits to an int.

    Each character found in UTIL_CN_NUM is replaced by its digit; other
    characters are kept. The leading run of digits of the result is the
    year. A two-digit year is placed in the current century.

    Returns None when the converted text does not start with a digit.
    """
    digits = ''.join(
        str(UTIL_CN_NUM[ch]) if ch in UTIL_CN_NUM else ch for ch in year
    )
    m = re.match(r"\d+", digits)
    if m is None:
        return None
    value = int(m.group(0))
    if len(m.group(0)) == 2:
        # `datetime` is the class imported via `from datetime import datetime`,
        # so today() is called on it directly.
        return datetime.today().year // 100 * 100 + value
    return value


def parse_datetime(msg):
    """Turn a date/time expression into a 'YYYY-MM-DD HH:MM:SS' string.

    The text is first handed to dateutil's ``parse`` in fuzzy mode. If that
    raises, a fixed pattern for Chinese date/time expressions is tried.

    Returns None for None/empty input or when neither approach recognises
    the text.
    """
    if msg is None or len(msg) == 0:
        return None
    try:
        dt = parse(msg, fuzzy=True)
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        # Deliberate best-effort: any failure in dateutil means "not a format
        # it understands", so fall back to the Chinese pattern.
        return _parse_cn_datetime(msg)


def _parse_cn_datetime(msg):
    """Parse a Chinese date/time expression with a fixed pattern.

    Fields found in ``msg`` overwrite the matching fields of today's date;
    hour, minute and second default to zero when absent. Returns the
    formatted string, or None if nothing matched or the resulting date is
    invalid.
    """
    m = re.match(
        r"([0-9零一二兩三四五六七八九十]+年)?"
        r"([0-9一二兩三四五六七八九十]+月)?"
        r"([0-9一二兩三四五六七八九十]+[號日])?"
        r"([上中下午晚早]+)?"
        r"([0-9零一二兩三四五六七八九十百]+[點:\.時])?"
        r"([0-9零一二三四五六七八九十百]+分?)?"
        r"([0-9零一二三四五六七八九十百]+秒)?",
        msg,
    )
    # Every group is optional, so the match object always exists; an empty
    # match means no date/time field was found at the start of the text.
    if not m.group(0):
        return None
    res = {
        "year": m.group(1),
        "month": m.group(2),
        "day": m.group(3),
        "hour": m.group(5) if m.group(5) is not None else '00',
        "minute": m.group(6) if m.group(6) is not None else '00',
        "second": m.group(7) if m.group(7) is not None else '00',
    }
    params = {}
    for name, raw in res.items():
        if not raw:
            continue
        # Strip only a real unit suffix. The '分' suffix is optional in the
        # pattern, so blindly dropping the last character would turn a
        # minute of '30' into '3'.
        number = raw.rstrip('年月號日點:.時分秒')
        value = year2dig(number) if name == 'year' else cn2dig(number)
        if value is not None:
            params[name] = int(value)
    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # Out-of-range field or impossible date (e.g. day 31 in a 30-day month).
        return None
    is_pm = m.group(4)
    if is_pm in ('下午', '晚上', '中午') and target_date.hour < 12:
        target_date = target_date.replace(hour=target_date.hour + 12)
    return target_date.strftime('%Y-%m-%d %H:%M:%S')


def check_time_valid(word):
    """Filter and normalise one candidate time string.

    Returns None when ``word`` consists only of digits and is at most six
    characters long. Otherwise a trailing '號<digits>' or '日<digits>' is
    collapsed to '日' (repeated until nothing changes) and the resulting
    string is returned.
    """
    while True:
        if re.match(r"\d+$", word) and len(word) <= 6:
            return None
        # The character class is [號日]; a '|' inside a class would be
        # matched literally rather than acting as alternation.
        normalised = re.sub(r'[號日]\d+$', '日', word)
        if normalised == word:
            return normalised
        word = normalised


# Time extraction
def time_extract(text):
    """Extract every date/time expression from ``text``.

    The sentence is segmented with ``psg.cut``. Consecutive tokens tagged
    'm' or 't' are concatenated into one candidate; the relative-day words
    in ``key_date`` start a new candidate and are replaced by the
    corresponding calendar date. Candidates are filtered through
    check_time_valid and converted with parse_datetime.

    Returns a list of the non-None parse_datetime results.
    """
    time_res = []
    word = ''
    key_date = {'今天': 0, '明天': 1, '后天': 2}
    for k, v in psg.cut(text):
        if k in key_date:
            if word != '':
                time_res.append(word)
            day = datetime.today() + timedelta(days=key_date[k])
            # The {y}/{m}/{d} placeholders survive strftime untouched and are
            # filled with the Chinese unit characters afterwards.
            word = day.strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
        elif word != '':
            if v in ['m', 't']:
                word = word + k
            else:
                time_res.append(word)
                word = ''
        elif v in ['m', 't']:
            word = k
    if word != '':
        time_res.append(word)
    checked = [check_time_valid(w) for w in time_res]
    parsed = [parse_datetime(w) for w in checked if w is not None]
    return [x for x in parsed if x is not None]


if __name__ == "__main__":
    text1 = '我要住到明天下午三點'
    print(text1, time_extract(text1), sep=':')

    text2 = '預定28號的房間'
    print(text2, time_extract(text2), sep=':')

    text3 = '我要從26號下午4點住到11月2號'
    print(text3, time_extract(text3), sep=':')

    text4 = '我要預訂今天到30的房間'
    print(text4, time_extract(text4), sep=':')

# Written on 2020-05-29
來源于《Python自然語言處理實戰》
總結
以上是生活随笔為你收集整理的python文本时间提取的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 加息是什么意思
- 下一篇: leetcode111 爬楼梯 pyth