2016年11月30日 星期三

Sentiment Analyzer (Spring Boot)

Sentiment Analyzer (Spring Boot)
進行情緒分析時,需要有斷詞工具,透過斷字斷詞擷取出關鍵字
再透過關鍵字去判斷為正面或負面情緒,因此還需定義正負評價詞庫
最後透過計算正負關鍵字出現次數來決定該文的正負評


1.中文斷字斷詞
本文採用mmseg4j作為中文的斷字斷詞,會使用mmseg4j原因是可自訂詞庫

可參考
mmseg4j 中文斷詞java 實作:http://ift.tt/2oFGaL9
中文分詞器性能比較:http://ift.tt/SrFUiJ

1.1 詞庫目錄
首先在src\main\resources建立詞庫檔案



詞庫定義如下

1.2 載入多個詞庫
如 1.5 完整程式碼中 Segment.initDictionary() 的 System.setProperty("mmseg.dic.path", filePath) 所示,指定詞庫所在路徑後,將會自動載入該路徑下 words*.dic 的詞庫檔案(檔名以 words 開頭、.dic 結尾)
目的為將詞庫分類,較容易管理與擴充,該路徑即為 1.1 詞庫目錄

1.3 測試斷詞功能
測試文句:
疾管署今天表示,9月在高雄國際航空站捕到漢他病毒鼠,但沒有即時告知。雖未釀成疫情,疾管署承認疏失,將檢討內部流程並相關處分。



org.iwlp.controller.SemanticAnalysisController
    @RequestMapping(method=RequestMethod.POST,value="/segment")
    @ApiOperation(value = "中文斷詞",notes="輸入中文斷詞處理的文章內容,檢視斷詞結果")
    @ApiImplicitParams({
        @ApiImplicitParam(name = "text", value = "text", required = true, defaultValue = "這行文字是要被中文斷詞處理的文章,可以從執行結果看斷詞是否成功",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "fieldType", value = "fieldType", required = false, defaultValue = "ComplexSeg",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query")
        })
    public Object segment(
            @RequestParam(required=true, value = "text") String text,
            @RequestParam(required = false, value = "fieldType") FieldTypeEnum fieldType,
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh
            ) {
        String result = null;
        Segment segment = new Segment(isRefresh);
        if(fieldType == null){
            fieldType = FieldTypeEnum.ComplexSeg;
        }
        try {
            result = segment.segWords(text, " | ", fieldType);
        } catch (IOException e) {
            e.printStackTrace();
        }
        TextBody textBody = new TextBody();
        textBody.setText(text);
        return result;
    }
疾 | 管 | 署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢 | 他 | 病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾 | 管 | 署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分


1.4 調整詞庫
src\main\resources\data\words-lexicon.dic
最後一行加入自訂詞庫"疾管署", "漢他病毒"

輸出結果
疾管署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢他病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾管署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分

   
1.5 斷詞(分詞)完整程式碼

package org.iwlp.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.iwlp.model.FieldTypeEnum;
import org.iwlp.model.result.KeyWordFrequency;
import org.iwlp.model.result.KeyWordsStatistic;
import org.iwlp.utils.MapSortingUtils;
import org.iwlp.utils.StreamUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.SimpleSeg;
import com.chenlb.mmseg4j.Word;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/**
 * 程式資訊摘要:分詞<P>
 * 類別名稱  :Segment.java<P>
 * 程式內容說明:<P>
 * 程式修改記錄:<P>
 * date:2016年9月8日<P>
 *@author A50319
 *@version 1.0
 *@since 1.0
 */
public class Segment {
    private Dictionary dic;
    private static final Logger log = LoggerFactory.getLogger(Segment.class);

    /**
     * Segment建構子
     * @param isRefresh : 是否重新刷新Dictionary
     */
    public Segment(boolean isRefresh) {
        initDictionary(isRefresh);
    }

    private void initDictionary(boolean isRefresh){
        String filePath = StreamUtils.getResourceURL(Segment.class, "data").getFile();
        if(isRefresh){
            Dictionary.clear(filePath);
            log.debug("path:{}", filePath);
            log.debug("Dictionary refresh sataus:{}", isRefresh);
        }
        System.setProperty("mmseg.dic.path", filePath);    //這裡可以指定自訂詞庫
        dic = Dictionary.getInstance();
    }

    /**
     * 預設SEG
     * @return ComplexSeg
     */
    private Seg getSeg() {
        return new ComplexSeg(dic);
    }

    private Seg getSeg(FieldTypeEnum fieldType) {
        Seg seg = null;
//        log.debug("fieldType:{}", fieldType.name());
       if(fieldType == FieldTypeEnum.SimpleSeg){
//           log.debug("SimpleSeg:{}", FieldTypeEnum.SimpleSeg.name());
           seg = new SimpleSeg(dic);
       } else if(fieldType == FieldTypeEnum.ComplexSeg){
//           log.debug("ComplexSeg:{}", FieldTypeEnum.ComplexSeg.name());
           seg = new ComplexSeg(dic);
       } else if(fieldType == FieldTypeEnum.MaxWordSeg){
//           log.debug("MaxWordSeg:{}", FieldTypeEnum.MaxWordSeg.name());
           seg = new MaxWordSeg(dic);
       } else{
           seg = getSeg();
       }
        return seg;
    }

    /**
     * 斷詞(預設ComplexSeg分詞器)
     * @param text : 內文
     * @param wordSpilt : 分割標記文字
     * @return String
     * @throws IOException
     */
    public String segWords(String text, String wordSpilt) throws IOException {
        Reader inputReader = new StringReader(text);
        StringBuilder sb = new StringBuilder();
        Seg seg = getSeg();
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        boolean first = true;
        while((word=mmSeg.next())!=null) {
            if(!first) {
                sb.append(wordSpilt);
            }
            String w = word.getString();
            sb.append(w);
            first = false;

        }
        return sb.toString();
    }

    /**
     * 斷詞
     * @param text : 內文
     * @param wordSpilt : 分割標記文字
     * @param fieldType : 分詞器類型
     * @return String
     * @throws IOException
     */
    public String segWords(String text, String wordSpilt, FieldTypeEnum fieldType) throws IOException {
        Reader inputReader = new StringReader(text);
        StringBuilder sb = new StringBuilder();
        Seg seg = getSeg(fieldType);
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        boolean first = true;
        while((word=mmSeg.next())!=null) {
            if(!first) {
                sb.append(wordSpilt);
            }
            String w = word.getString();
            sb.append(w);
            first = false;

        }
        return sb.toString();
    }

    /**
     * 斷詞(預設ComplexSeg分詞器)
     * @param text : 內文
     * @return List<String> words
     */
    public List<String> segWords(String text) {
        List<String> words = new ArrayList<String>();
        Reader inputReader = new StringReader(text);
        Seg seg = getSeg();
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        try {
            while((word=mmSeg.next())!=null) {
                words.add(word.getString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally{
            IOUtils.closeQuietly(inputReader);
        }
        return words;
    }

    /**
     * 斷詞
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return List<String> words
     */
    public List<String> segWords(String text, FieldTypeEnum fieldType) {
        List<String> words = new ArrayList<String>();
        Reader inputReader = new StringReader(text);
        Seg seg = getSeg(fieldType);
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        try {
            while((word=mmSeg.next())!=null) {
                words.add(word.getString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally{
            IOUtils.closeQuietly(inputReader);
        }
        return words;
    }


    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return Map<String, Integer>
     */
    public Map<String, Integer> calculateKeywordsMap(String text, FieldTypeEnum fieldType){
        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        Map<String, Object> tempKeywords = new HashMap<String, Object>();
        List<String> words = segWords(text, fieldType);
        for(String keyword : words){
            if(tempKeywords.containsKey(keyword)){
                int count = ((int) tempKeywords.get(keyword)) + 1;
                tempKeywords.put(keyword, count);
            } else{
                tempKeywords.put(keyword, 1);
            }
        }
        Gson gson = new GsonBuilder().create();
//        log.debug("sort before:\n{}",gson.toJson(MapSortingUtils.sortMapByValue(tempKeywords)));
        tempKeywords = MapSortingUtils.reverseMap(MapSortingUtils.sortMapByValue(tempKeywords));
//        log.debug("sort affter:\n{}",gson.toJson(tempKeywords));
        for(Entry<String, Object> entry : tempKeywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            keywords.put(key, value);
//            log.debug("key:{}, value:{}",key, value);
        }

        tempKeywords.clear();
        tempKeywords = null;

        return keywords;
    }

    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return KeyWordsStatistic
     */
    public KeyWordsStatistic calculateKeywords(String text, FieldTypeEnum fieldType){
        KeyWordsStatistic ks = new KeyWordsStatistic();
        Map<String, Integer> keywords = calculateKeywordsMap(text, fieldType);
        ks.setTotal(keywords.size());
        List<KeyWordFrequency> kfs = new ArrayList<KeyWordFrequency>();
        for(Entry<String, Integer> entry : keywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            KeyWordFrequency kf = new KeyWordFrequency();
            kf.setKeyWord(key);
            kf.setCount(value);
            kfs.add(kf);
        }
        ks.setResult(kfs);
        return ks;
    }

    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @param minKeywordLength : 關鍵字最小長度門檻
     * @param minFrequency : 關鍵字出現最小次數門檻
     * @return KeyWordsStatistic
     */
    public KeyWordsStatistic calculateKeywords(String text, FieldTypeEnum fieldType, int minKeywordLength, int minFrequency){
        KeyWordsStatistic ks = new KeyWordsStatistic();
        Map<String, Integer> keywords = calculateKeywordsMap(text, fieldType);
        List<KeyWordFrequency> kfs = new ArrayList<KeyWordFrequency>();
        for(Entry<String, Integer> entry : keywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            if(key.length() >= minKeywordLength){
                if(value >= minFrequency){
                    KeyWordFrequency kf = new KeyWordFrequency();
                    kf.setKeyWord(key);
                    kf.setCount(value);
                    kfs.add(kf);
                }
            }
        }
        ks.setResult(kfs);
        ks.setTotal(ks.getResult().size());
        return ks;
    }

}


2.關鍵字統計
此功能目的為統計詞庫檔案內關鍵字於文章中出現的頻率,只要先斷字斷詞,再統計出現次數即可
minKeywordLength:關鍵字最小文字長度,僅保留長度大於等於此值的關鍵字;例如值為3時,"疾管署 | 承認 | 疏失"中僅保留長度為3的關鍵字"疾管署"
minFrequency:關鍵字最小出現次數,僅保留出現次數大於等於此值的關鍵字

2.1 測試關鍵字統計

org.iwlp.controller.SemanticAnalysisController
@RequestMapping(method=RequestMethod.POST,value="/keywords")
    @ApiOperation(value = "關鍵字統計",notes="根據文章內容統計關鍵字出現次數")
    @ApiImplicitParams({
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query"),
        @ApiImplicitParam(name = "fieldType", value = "fieldType", required = false, defaultValue = "ComplexSeg",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "minKeywordLength", value = "關鍵字最小文字長度", required = false, defaultValue = "1",dataType = "integer", paramType = "query"),
        @ApiImplicitParam(name = "minFrequency", value = "關鍵字出現頻率", required = false, defaultValue = "1",dataType = "integer", paramType = "query")
        })
    public KeyWordsStatistic calculateKeywords(
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh,
            @RequestParam(required = false, value = "fieldType") FieldTypeEnum fieldType,
            @RequestParam(required = false, value = "minKeywordLength") Integer minKeywordLength,
            @RequestParam(required = false, value = "minFrequency") Integer minFrequency,
           @RequestBody(required=true) final TextBody body
            ) {
        if(minKeywordLength == null){
            minKeywordLength = 1;
        }
        if(minFrequency == null){
            minFrequency = 1;
        }
        AssertUtils.isMoreThan(minKeywordLength, 1);
        AssertUtils.isMoreThan(minFrequency, 1);
        TextBody textBody = gson.fromJson(gson.toJson(body), TextBody.class);
        Segment segment = new Segment(isRefresh);
        if(fieldType == null){
            fieldType = FieldTypeEnum.ComplexSeg;
        }
        KeyWordsStatistic kws = segment.calculateKeywords(textBody.getText(), fieldType, minKeywordLength, minFrequency);
        return kws;
    }
{"total":29,"result":[{"keyword":"疾管署","count":2},{"keyword":"高雄","count":1},{"keyword":"9","count":1},{"keyword":"疫情","count":1},{"keyword":"到","count":1},{"keyword":"內部","count":1},{"keyword":"相關","count":1},{"keyword":"在","count":1},{"keyword":"流程","count":1},{"keyword":"並","count":1},{"keyword":"即時","count":1},{"keyword":"承認","count":1},{"keyword":"檢討","count":1},{"keyword":"鼠","count":1},{"keyword":"漢他病毒","count":1},{"keyword":"站","count":1},{"keyword":"國際航空","count":1},{"keyword":"今天","count":1},{"keyword":"捕","count":1},{"keyword":"告知","count":1},{"keyword":"疏失","count":1},{"keyword":"有","count":1},{"keyword":"月","count":1},{"keyword":"將","count":1},{"keyword":"雖未","count":1},{"keyword":"但沒","count":1},{"keyword":"釀成","count":1},{"keyword":"表示","count":1},{"keyword":"處分","count":1}]}


3.情緒詞庫
本文中情緒分析採用作者der3318所開發之情緒分析,並加以修改
根據第1章之中文斷詞機制將文章的關鍵字分離出來,並判斷這些關鍵字代表之情緒為正面或負面
經統計後可得情緒分析分數

可參考
SentimentAnalyzer:http://ift.tt/2oG3Yi3
內建採用結巴(jieba)中文斷詞,此處修改為mmseg4j,原因是原先系統已採用mmseg4j

3.1 情緒詞庫目錄
最主要為定義正面詞庫(positive.txt)以及負面詞庫(negative.txt)



positive.txt


negative.txt


3.2 中文斷詞設定
此處故意沒刪除結巴(jieba)中文斷詞之處,請參閱綠色被註解的地方
org.iwlp.core.analyzer.SegChinese
package org.iwlp.core.analyzer;
import java.io.IOException;
import java.util.ArrayList;
import org.iwlp.core.Segment;
import org.iwlp.model.FieldTypeEnum;
//import com.huaban.analysis.jieba.JiebaSegmenter;
//import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
//import com.huaban.analysis.jieba.SegToken;
public class SegChinese {
       // a static Segmenter shared by all analyzers
       private static SegChinese seg;
       
       //mmseg4j
       private Segment segmenter;
       public static boolean isRefresh;
       
       //結巴分詞
//     protected JiebaSegmenter segmenter;
//     public SegChinese() {
//           segmenter = new JiebaSegmenter();
//     }
       public SegChinese(boolean isRefresh) {
        segmenter = new Segment(isRefresh);
    }
       
       // return the prepared Segmenter. if not found, create one
       public static SegChinese getInstance() {
        if (seg == null) {
            synchronized (SegChinese.class) {
                if (seg == null) {
                    seg = new SegChinese(isRefresh);
                    return seg;
                }
            }
        }
        return seg;
    }
       public static void removeInstance() {
           seg = null;
       }
       
       public ArrayList<String> getSegList(String text) throws IOException {
             
             return (ArrayList<String>) segmenter.segWords(text, FieldTypeEnum.ComplexSeg);
       }
       public String segWords(String text, String wordSpilt) throws IOException {
             return segmenter.segWords(text, wordSpilt);
       }
       
       
//     public ArrayList<String> getSegList(String txt) throws IOException {
//        ArrayList<String> output = new ArrayList<String>();
//        for( SegToken token : segmenter.process(txt, SegMode.INDEX) )   if( !token.word.isEmpty() ) output.add(token.word);
//        return output;
//    }
//
//    public String segWords(String txt, String wordSpilt) throws IOException {
//        String output = new String("");
//        for(  SegToken token : segmenter.process(txt, SegMode.INDEX) )  output += (token.word + wordSpilt);
//        return output;
//    }
}

3.3 情緒分析流程
情緒計算方式為將文章

疾管署今天表示,9月在高雄國際航空站捕到漢他病毒鼠,但沒有即時告知。雖未釀成疫情,疾管署承認疏失,將檢討內部流程並相關處分。衛生福利部疾病管制署今天發布新聞稿表示,9月在高雄國際航空站捕獲錢鼠送實驗室檢驗檢出漢他病毒陽性,當時並未立即通知航空站,日前與航站定期召開衛生小組會議時,才告知並要求加強環境清潔與防鼠措施。

用已設定之中文斷詞進行斷句斷詞

疾管署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢他病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾管署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分 | 衛生 | 福利 | 部 | 疾病 | 管制 | 署 | 今天 | 發 | 布 | 新聞稿 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕獲 | 錢鼠 | 送 | 實驗室 | 檢驗 | 檢出 | 漢他病毒 | 陽性 | 當時 | 並未 | 立即 | 通知 | 航空站 | 日前 | 與 | 航 | 站 | 定期 | 召開 | 衛生 | 小組 | 會議 | 時 | 才 | 告知 | 並要 | 求 | 加強 | 環境 | 清潔 | 與 | 防鼠 | 措施

之後再進行正負情緒關鍵字之初始化,即正面詞庫分數定為1分,負面詞庫定為-1分,此處會將英文字母一律轉為小寫
package org.iwlp.core.analyzer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SentimentalDictionary {
    private static final Logger log = LoggerFactory.getLogger(SentimentalDictionary.class);
    // a static dictionary shared by all analyzers 
    private static SentimentalDictionary dictionary;
    // filenames
    private static String filenameP = new String("./docs/positive.txt");
    private static String filenameN = new String("./docs/negative.txt");
    private static String filenameADV = new String("./docs/adv.txt");

    // a HashMap holding sentimental words as keys
    private HashMap<String, Integer> mydict = new HashMap<String, Integer>();
    // a HashMap holding adverbs as keys
    private HashMap<String, Boolean> myadv = new HashMap<String, Boolean>();

    // return the prepared dictionary. if not found, create one
    public static SentimentalDictionary getInstance() {
        if (dictionary == null) {
            synchronized (SentimentalDictionary.class) {
                if (dictionary == null) {
                    dictionary = new SentimentalDictionary();
                    dictionary.makeDict();
                    return dictionary;
                }
            }
        }
        return dictionary;
    }

    // remove the current dictionary due to some setting changes
    public static void removeInstance() {
        dictionary = null;
    }

    public static void setFilename(String _filenameP, String _filenameN, String _filenameADV) {
        filenameP = _filenameP;
        filenameN = _filenameN;
        filenameADV = _filenameADV;
    }

    // add a positive word into dictionary
    public synchronized void addPositiveWords(String _string) {
        if( mydict.containsKey(_string) )    mydict.put(_string, mydict.get(_string) + 1);
        else    mydict.put(_string, 1);
    }

    // add a negative word into dictionary
    public synchronized void addNegativeWords(String _string) {
        if( mydict.containsKey(_string) )    mydict.put(_string, mydict.get(_string) - 1);
        else    mydict.put(_string, -1);
    }

    // get the score of the sentimental word, and return 0 when not found
    public int checkWord(String _string) {
        if( _string.isEmpty() || !mydict.containsKey(_string) )    return 0;
        if( mydict.get(_string) > 0 )    return 1;
        return -1;
    }

    // check if the input word is adv or not
    public boolean checkAdv(String _string) {
        if( myadv.containsKey(_string) && !_string.isEmpty() )    return true;
        return false;
    }

    // return an ArrayList containing positive words
    public ArrayList<String> getPositiveWords() {
        ArrayList<String> output_list = new ArrayList<String>();
        for( String key : mydict.keySet() )    if( mydict.get(key) > 0 )    output_list.add(key);
        return output_list;
    }

    // return an ArrayList containing negative words
    public ArrayList<String> getNegativeWords() {
        ArrayList<String> output_list = new ArrayList<String>();
        for( String key : mydict.keySet() )    if( mydict.get(key) < 0 )    output_list.add(key);
        return output_list;
    }

    // get the size(numbers of words) of the dictionary
    public int getSize() {
        return mydict.size() + myadv.size();
    }

    // print dictionary
    public void printDict() {
        for( String key : mydict.keySet() )    System.out.println(key + ", " + mydict.get(key));
    }

    // put the words into the the HashMaps from 3 input files(positive sentimental words, negative sentimental words, adverbs)
    public void makeDict() {        
        try {
            // access positive words
            String[] filenames = {filenameP, /*"./docs/pos_by_training.txt"*/};
            for(String filename : filenames) {
                log.debug("Accessing :{}" , filename);
                FileReader fr = new FileReader(filename);
                BufferedReader br = new BufferedReader(fr);
                String tmp = br.readLine();
                while(tmp != null) {
                    //新增正面詞庫(英文自動轉小寫)
                    addPositiveWords( tmp.trim().toLowerCase() );
                    tmp = br.readLine();
                }
                br.close();
            }    
        }
        catch (Exception e) {
            System.out.println("File of Positive Words Not Found");
            e.printStackTrace();
        }
        try {
            // access negative words
            String[] filenames = {filenameN/*, "./docs/neg_by_training.txt"*/};
            for(String filename : filenames) {
//                System.out.println("Accessing " + filename);
                log.debug("Accessing:{}", filename);
                FileReader fr = new FileReader(filename);
                BufferedReader br = new BufferedReader(fr);
                String tmp = br.readLine();
                while(tmp != null) {
                    //新增負面詞庫(英文自動轉小寫)
                    addNegativeWords( tmp.trim().toLowerCase() );
                    tmp = br.readLine();
                }
                br.close();
            }
        }
        catch (Exception e) {
            System.out.println("File of Negative Words Not Found");
            e.printStackTrace();
        }
        try {
            log.debug("Accessing :{}", filenameADV);
            // access negative words
            FileReader fr = new FileReader(filenameADV);
            BufferedReader br = new BufferedReader(fr);
            String tmp = br.readLine();
            while(tmp != null) {
                myadv.put(tmp.trim() , true);
                tmp = br.readLine();
            }
//            br.close();
            IOUtils.closeQuietly(br);
            IOUtils.closeQuietly(fr);
        }
        catch (Exception e) {
            System.out.println("File of Adverbs Not Found");
            e.printStackTrace();
        }
    }

}


再從斷字後的關鍵字中查找正面情緒詞庫和負面情緒詞庫,並計算出現次數

承認(1) | 疏失(-1) | 檢討(-1) | 處分(-1) | 衛生(1) | 衛生(1) | 清潔(1)

統計所有正負評分
正面關鍵字分數6
負面關鍵字分數8
6-8 = -2, 即本文章情緒分數總分
{
  "time": "2 mSec",
  "lexicon": {
    "positive": 6,
    "negative": 8
  },
  "comment": "Negative",
  "result": [
    {
      "keyword": "衛生",
      "value": 2
    },
    {
      "keyword": "承認",
      "value": 1
    },
    {
      "keyword": "清潔",
      "value": 1
    },
    {
      "keyword": "福利",
      "value": 1
    },
    {
      "keyword": "通知",
      "value": 1
    },
    {
      "keyword": "毒",
      "value": -2
    },
    {
      "keyword": "要求",
      "value": -1
    },
    {
      "keyword": "處分",
      "value": -1
    },
    {
      "keyword": "檢討",
      "value": -1
    },
    {
      "keyword": "疾病",
      "value": -1
    },
    {
      "keyword": "沒有",
      "value": -1
    },
    {
      "keyword": "疏失",
      "value": -1
    }
  ]
}

3.4 測試情緒分析

org.iwlp.controller.SemanticAnalysisController
@RequestMapping(method=RequestMethod.POST,value="/sentimentAnalysis")
    @ApiOperation(value = "情緒分析",notes="根據文章內容正面與負面關鍵字分析文章正負評價")
    @ApiImplicitParams({
//        @ApiImplicitParam(name = "text", value = "text", required = true, defaultValue = "這酒店骯髒環境差,服務人員態度不佳,不敢恭維",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "topN", value = "正負面詞庫數量門檻</br>-1:不限數量,default:10", required = false, defaultValue = "10",dataType = "int", paramType = "query"),
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query")
        })
    public Object sentimentAnalysis(
//            @RequestParam(value = "text") String text,
            @RequestParam(required = false, value = "topN") Integer topN,
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh,
            @RequestBody(required=true) final TextBody body
            ) {
        if(topN == null){
            topN = 10;
        }
        AssertUtils.isMoreThan(topN, -1);
       
        TextBody textBody = gson.fromJson(gson.toJson(body), TextBody.class);
        Sentiment sentiment = new Sentiment();
        String positiveDict = StreamUtils.getResourceURL(Segment.class, "sa/positive.txt").getFile();
        String negativeDict = StreamUtils.getResourceURL(Segment.class, "sa/negative.txt").getFile();
        String advDict = StreamUtils.getResourceURL(Segment.class, "sa/adv.txt").getFile();
        String trainingFile = StreamUtils.getResourceURL(Segment.class, "sa/training.txt").getFile();
        String trainingAnswer = StreamUtils.getResourceURL(Segment.class, "sa/answer.txt").getFile();
       
//        String option = StreamUtils.getResourceURL(Segment.class, "sa/opinion.txt").getFile();
//        String result = StreamUtils.getResourceURL(Segment.class, "sa/sa-result.txt").getFile();
       
//        log.debug("positiveDict:{}",positiveDict);
//        log.debug("negativeDict:{}",negativeDict);
//        log.debug("advDict:{}",advDict);
//        log.debug("trainingFile:{}",trainingFile);
//        log.debug("trainingAnswer:{}",trainingAnswer);
//        log.debug("option:{}",option);
//        log.debug("result:{}",result);
       
        SentimentAnalyzer.setDictionary(positiveDict, negativeDict, advDict);
        SentimentAnalyzer.setTrainingData(trainingFile, trainingAnswer);
       
        //set PMI-SO Rate, deciding how strictly the analyzer chooses words (default = 3.0)
        SentimentAnalyzer.setSORate(3.0);
        //set the number of threads available for the analyzer (default = 4)
        SentimentAnalyzer.setNTHREADS(4);
        //create the analyzer wth I/O filename (default= ./docs/opinion.txt, ./result.txt)
       
//        String posByTrainingPath, String negByTrainingPath
//        String posByTrainingPath = StreamUtils.getResourceURL(Segment.class, "sa/positive.txt").getFile();
//        String negByTrainingPath = StreamUtils.getResourceURL(Segment.class, "sa/negative.txt").getFile();
//        SentimentAnalyzer sa = new SentimentAnalyzer(option, result,  posByTrainingPath, negByTrainingPath, isRefresh);
       
        SentimentAnalyzer sa = new SentimentAnalyzer(isRefresh);
        sentiment = sa.calculateSentiment(textBody.getText(), topN);
//        sa.workAndWriteFile();
      
        return sentiment;
    }
{
  "time": "2 mSec",
  "lexicon": {
    "positive": 6,
    "negative": 8
  },
  "comment": "Negative",
  "result": [
    {
      "keyword": "衛生",
      "value": 2
    },
    {
      "keyword": "承認",
      "value": 1
    },
    {
      "keyword": "清潔",
      "value": 1
    },
    {
      "keyword": "福利",
      "value": 1
    },
    {
      "keyword": "通知",
      "value": 1
    },
    {
      "keyword": "毒",
      "value": -2
    },
    {
      "keyword": "要求",
      "value": -1
    },
    {
      "keyword": "處分",
      "value": -1
    },
    {
      "keyword": "檢討",
      "value": -1
    },
    {
      "keyword": "疾病",
      "value": -1
    },
    {
      "keyword": "沒有",
      "value": -1
    },
    {
      "keyword": "疏失",
      "value": -1
    }
  ]
}

4.API DOC

http://ift.tt/2oG3qc8

5.專案配置檔
Gradle configuration
build.gradle
// Build script for the iwlp-sa Spring Boot application, packaged as a WAR.
buildscript {
       ext {
             // Spring Boot version used by the Gradle plugin below
             springBootVersion = '1.4.2.RELEASE'
       }
       repositories {
             mavenCentral()
       }
       dependencies {
              classpath("org.springframework.boot:spring-boot-gradle-plugin:${springBootVersion}")
       }
}
apply plugin: 'java'
apply plugin: 'eclipse-wtp'
apply plugin: 'org.springframework.boot'
apply plugin: 'war'
// Name and version of the generated WAR
war {
       baseName = 'iwlp-sa'
       version = '0.0.1-SNAPSHOT'
}
// Compile for Java 8
sourceCompatibility = 1.8
targetCompatibility = 1.8
repositories {
       mavenCentral()
}
configurations {
       providedRuntime
}
dependencies {
       compile('org.springframework.boot:spring-boot-starter-web')
       runtime('mysql:mysql-connector-java')
       providedRuntime('org.springframework.boot:spring-boot-starter-tomcat')
       testCompile('org.springframework.boot:spring-boot-starter-test')
       
       //swagger
       compile group: 'io.springfox', name: 'springfox-swagger2', version: '2.5.0'
       compile group: 'io.springfox', name: 'springfox-swagger-ui', version: '2.5.0'
       
       compile group: 'com.restfb', name: 'restfb', version: '1.31.0'
       
       // http://ift.tt/2o4C7vm
       compile group: 'org.jsoup', name: 'jsoup', version: '1.9.2'
       
       //FOR CrawlerPack
       compile group: 'com.googlecode.juniversalchardet', name: 'juniversalchardet', version: '1.0.3'
       
       // Chinese word segmentation
       // http://ift.tt/2nRHhdx
       // NOTE(review): the three mmseg4j artifacts are pinned to different versions
       // (1.10.0 / 1.9.1 / 1.8.6); confirm that they are compatible with each other.
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-core', version: '1.10.0'
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-analysis', version: '1.9.1'
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-core-with-dic', version: '1.8.6'
       compile group: 'com.google.code.gson', name: 'gson', version: '2.7'
       
       compile group: 'commons-io', name: 'commons-io', version: '2.5'
       compile group: 'commons-lang', name: 'commons-lang', version: '2.6'
       
       compile group: 'org.apache.commons', name: 'commons-vfs2', version: '2.1'
       //http client
       compile group: 'org.apache.httpcomponents', name: 'httpcore', version: '4.4.5'
       compile group: 'org.apache.commons', name: 'commons-compress', version: '1.12'
       compile group: 'commons-httpclient', name: 'commons-httpclient', version: '3.1'
       compile group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.2'
       compile group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.2'
}
// Entry point of the application
springBoot {
    mainClass = "org.iwlp.IwlpSaApplication"
}

Spring Boot configuration
application.properties
# Context path of the web application
server.contextPath=/iwlp-sa/
#logging
# Console log line layout
logging.pattern.console=%d{HH:mm:ss.SSS} [%thread] %-5level %logger{26}.%M [%line] - %msg%n%rEx
# WARN for the root logger and Spring Web, DEBUG for the org.iwlp config/controller/core packages
logging.level.org.springframework.web=WARN
logging.level.root=WARN
logging.level.org.iwlp.config=DEBUG
logging.level.org.iwlp.controller=DEBUG
logging.level.org.iwlp.core=DEBUG
# JMX endpoint settings
endpoints.jmx.domain=iwlp-sa
endpoints.jmx.uniqueNames=true


6.專案下載



Tags: Spring, Spring Boot, Sentiment Analyzer, IFTTT-SYNC
November 30, 2016 at 02:23PM
Open in Evernote

沒有留言:

張貼留言