segmenter.go
This post collects and shares the source of segmenter.go, the core segmenter of the sego Go Chinese word-segmentation library, for reference.
// Go Chinese word segmentation
package sego

import (
    "bufio"
    "fmt"
    "log"
    "math"
    "os"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"
)
const (
    minTokenFrequency = 2 // only read tokens with at least this frequency from the dictionary file
)
// Segmenter is the word segmenter.
type Segmenter struct {
    dict *Dictionary
}

// jumper records the forward jump information of the Viterbi algorithm at a
// given character: the token of the jump and the shortest path value from the
// start of the text to this character.
type jumper struct {
    minDistance float32
    token       *Token
}

// Dictionary returns the dictionary used by the segmenter.
func (seg *Segmenter) Dictionary() *Dictionary {
    return seg.dict
}
// LoadDictionary loads the dictionary from files.
//
// Multiple dictionary files can be loaded, with the file names separated by
// ",". Dictionaries listed first take precedence, for example
//     "user_dict.txt,common_dict.txt"
// When a token appears in both the user dictionary and the common dictionary,
// the user dictionary entry is used.
//
// The dictionary format is one token per line:
//     token_text frequency part_of_speech
func (seg *Segmenter) LoadDictionary(files string) {
    seg.dict = NewDictionary()
    for _, file := range strings.Split(files, ",") {
        log.Printf("loading sego dictionary %s", file)
        dictFile, err := os.Open(file)
        if err != nil {
            log.Fatalf("cannot load dictionary file \"%s\" \n", file)
        }
        // close the file when LoadDictionary returns (the defer runs at
        // function exit, not at the end of each loop iteration)
        defer dictFile.Close()
        reader := bufio.NewReader(dictFile)
        var text string
        var freqText string
        var frequency int
        var pos string

        // read tokens line by line
        for {
            size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)

            if size == 0 {
                // end of file
                break
            } else if size < 2 {
                // invalid line
                continue
            } else if size == 2 {
                // no part-of-speech tag; use the empty string
                pos = ""
            }

            // parse the frequency
            var err error
            frequency, err = strconv.Atoi(freqText)
            if err != nil {
                continue
            }

            // skip tokens whose frequency is too low
            if frequency < minTokenFrequency {
                continue
            }

            // add the token to the dictionary
            words := splitTextToWords([]byte(text))
            token := Token{text: words, frequency: frequency, pos: pos}
            seg.dict.addToken(token)
        }
    }
    // compute each token's path value; see the comments on the Token struct
    // for what this value means
    logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
    }

    // further segment each token for search-engine mode; see the comments on
    // the Token struct for how this mode is used
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        segments := seg.segmentWords(token.text, true)

        // count the sub-tokens that need to be added
        numTokensToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            if len(segments[iToken].token.text) > 1 {
                // skip sub-tokens that are a single character long
                // TODO: this deserves more thought, especially when the
                // dictionary contains English compound words
                numTokensToAdd++
            }
        }
        token.segments = make([]*Segment, numTokensToAdd)

        // add the sub-tokens
        iSegmentsToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            if len(segments[iToken].token.text) > 1 {
                token.segments[iSegmentsToAdd] = &segments[iToken]
                iSegmentsToAdd++
            }
        }
    }

    log.Println("sego dictionary loaded")
}
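To make the format expected by LoadDictionary concrete, here is a hypothetical user dictionary (the file names and entries below are illustrative, not files shipped with sego). Each line carries the token text, its frequency, and an optional part-of-speech tag:

    新能源 200 n
    自行车 120 n
    sego 10 nz

It would be loaded together with a general dictionary like so, with the user file listed first so its entries take precedence:

    var seg Segmenter
    seg.LoadDictionary("user_dict.txt,common_dict.txt")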
// Segment segments a piece of text.
//
// Input:
//     bytes    UTF-8 text as a byte slice
//
// Output:
//     []Segment    the resulting segments
func (seg *Segmenter) Segment(bytes []byte) []Segment {
    return seg.internalSegment(bytes, false)
}
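For context, a minimal end-to-end sketch of calling Segment from outside the package. The import path and dictionary path follow the upstream sego README, and SegmentsToString is the helper the project provides for printing results; treat all three as assumptions if your copy differs:

    package main

    import (
        "fmt"

        "github.com/huichen/sego"
    )

    func main() {
        var seg sego.Segmenter
        // dictionary path as used in the upstream README; adjust to your checkout
        seg.LoadDictionary("data/dictionary.txt")

        text := []byte("中华人民共和国中央人民政府")
        segments := seg.Segment(text)

        // false selects normal mode; true would print search-engine mode output
        fmt.Println(sego.SegmentsToString(segments, false))
    }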
func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
    // handle the trivial case
    if len(bytes) == 0 {
        return []Segment{}
    }

    // split the text into characters
    text := splitTextToWords(bytes)

    return seg.segmentWords(text, searchMode)
}
func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
    // in search mode, a token of a single character cannot be split further
    if searchMode && len(text) == 1 {
        return []Segment{}
    }

    // jumpers holds the forward jump information at each character: the token
    // of the jump and the shortest path value from the start of the text
    // segment to that character
    jumpers := make([]jumper, len(text))

    tokens := make([]*Token, seg.dict.maxTokenLength)
    for current := 0; current < len(text); current++ {
        // look up the shortest path at the previous character, used to
        // compute the path values that follow
        var baseDistance float32
        if current == 0 {
            // at the start of the text the base distance is zero
            baseDistance = 0
        } else {
            baseDistance = jumpers[current-1].minDistance
        }

        // find all tokens that start at the current character
        numTokens := seg.dict.lookupTokens(
            text[current:minInt(current+seg.dict.maxTokenLength, len(text))], tokens)

        // for every candidate token, update the jump information at the
        // character where that token ends
        for iToken := 0; iToken < numTokens; iToken++ {
            location := current + len(tokens[iToken].text) - 1
            if !searchMode || current != 0 || location != len(text)-1 {
                updateJumper(&jumpers[location], baseDistance, tokens[iToken])
            }
        }

        // when no token matches the current character, add a pseudo token
        if numTokens == 0 || len(tokens[0].text) > 1 {
            updateJumper(&jumpers[current], baseDistance,
                &Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
        }
    }

    // first backward scan: count how many segments will be produced
    numSeg := 0
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg++
        index = location - 1
    }

    // second backward scan: write the segments into the final result
    outputSegments := make([]Segment, numSeg)
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg--
        outputSegments[numSeg].token = jumpers[index].token
        index = location - 1
    }

    // compute the byte position of each segment
    bytePosition := 0
    for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
        outputSegments[iSeg].start = bytePosition
        bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
        outputSegments[iSeg].end = bytePosition
    }
    return outputSegments
}
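The two backward scans can be puzzling at first. Below is a self-contained toy (hypothetical data, not sego's types) that mirrors them: tokenLen[i] plays the role of len(jumpers[i].token.text), the length of the best token ending at character i. Entries the scans never land on are irrelevant:

    package main

    import "fmt"

    func main() {
        // for a 5-character text such as "我来到北京", suppose the best path is
        // "我" / "来到" / "北京": the token ending at index 0 has length 1, at
        // index 2 length 2, and at index 4 length 2
        tokenLen := []int{1, 1, 2, 1, 2}

        // first scan: count the segments by hopping backwards over token lengths
        numSeg := 0
        for index := len(tokenLen) - 1; index >= 0; {
            numSeg++
            index -= tokenLen[index] // same as location - 1 above
        }

        // second scan: fill the start indices from the back
        starts := make([]int, numSeg)
        for index := len(tokenLen) - 1; index >= 0; {
            numSeg--
            starts[numSeg] = index - tokenLen[index] + 1
            index -= tokenLen[index]
        }
        fmt.Println(starts) // [0 1 3] → 我 / 来到 / 北京
    }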
// updateJumper updates the jump information when
//     1. this position has never been visited (jumper.minDistance is zero), or
//     2. the current shortest path at this position is longer than the new one,
// setting the shortest path value at the position to baseDistance plus the
// new token's distance.
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
    newDistance := baseDistance + token.distance
    if jumper.minDistance == 0 || jumper.minDistance > newDistance {
        jumper.minDistance = newDistance
        jumper.token = token
    }
}
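Why minimizing this sum yields the most probable segmentation: LoadDictionary set each token's distance to log2(totalFrequency) - log2(frequency) = -log2(P(token)), so adding distances along a path is the same as multiplying token probabilities, and the shortest path is the Viterbi-optimal one. The pseudo token's distance of 32 corresponds to a probability of 2^-32, keeping unknown single characters a last resort. A runnable sketch with made-up corpus numbers:

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        // hypothetical corpus: total frequency 2^20, one token seen 1000 times
        distance := math.Log2(1<<20) - math.Log2(1000)
        fmt.Printf("distance ≈ %.2f bits\n", distance) // ≈ 10.03

        // two such tokens cost about 20.1 bits in total, well under the 32-bit
        // pseudo token, so any dictionary path wins over an unknown character
        fmt.Println(2*distance < 32) // true
    }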
// minInt returns the smaller of two integers.
func minInt(a, b int) int {
    if a > b {
        return b
    }
    return a
}

// maxInt returns the larger of two integers.
func maxInt(a, b int) int {
    if a > b {
        return a
    }
    return b
}
// splitTextToWords splits the text into characters.
func splitTextToWords(text Text) []Text {
    output := make([]Text, 0, len(text)/3)
    current := 0
    inAlphanumeric := true
    alphanumericStart := 0
    for current < len(text) {
        r, size := utf8.DecodeRune(text[current:])
        if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
            // the current rune is a Latin letter or a digit (not CJK)
            if !inAlphanumeric {
                alphanumericStart = current
                inAlphanumeric = true
            }
        } else {
            if inAlphanumeric {
                inAlphanumeric = false
                if current != 0 {
                    output = append(output, toLower(text[alphanumericStart:current]))
                }
            }
            output = append(output, text[current:current+size])
        }
        current += size
    }

    // handle the case where the text ends with alphanumeric characters
    if inAlphanumeric {
        if current != 0 {
            output = append(output, toLower(text[alphanumericStart:current]))
        }
    }

    return output
}
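The effect: consecutive Latin letters and digits are grouped into a single lower-cased element, while every CJK character becomes its own element. A small sketch of this behavior, which must live inside the package since splitTextToWords is unexported, and which assumes Text is a []byte alias as in the upstream sego repository:

    func demoSplitTextToWords() {
        for _, w := range splitTextToWords(Text("Chinese分词Test123")) {
            fmt.Printf("[%s]", string(w))
        }
        fmt.Println()
        // expected output: [chinese][分][词][test123]
    }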
// toLower converts an English word to lower case.
func toLower(text []byte) []byte {
    output := make([]byte, len(text))
    for i, t := range text {
        if t >= 'A' && t <= 'Z' {
            output[i] = t - 'A' + 'a'
        } else {
            output[i] = t
        }
    }
    return output
}
Reposted from: https://www.cnblogs.com/zhangboyu/p/7462003.html