2016年11月30日 星期三

Sentiment Analyzer (Spring Boot)

Sentiment Analyzer (Spring Boot)
進行情緒分析時,需要有斷詞工具,透過斷字斷詞擷取出關鍵字
再透過關鍵字去判斷為正面或負面情緒,因此還需定義正負評價詞庫
最後透過計算正負關鍵字出現次數來決定該文的正負評


1.中文斷字斷詞
本文採用mmseg4j作為中文的斷字斷詞,會使用mmseg4j原因是可自訂詞庫

可參考
mmseg4j 中文斷詞java 實作:http://ift.tt/2oFGaL9
中文分詞器性能比較:http://ift.tt/SrFUiJ

1.1 詞庫目錄
首先在src\main\resources建立詞庫檔案



詞庫定義如下

1.2 載入多個詞庫
如下方Line 68指定詞庫所在路徑,將會自動載入words***.dic的詞庫檔案(words開頭.dic結尾)
目的為將詞庫分類,較容易管理與擴充,Line 62路徑即為 1.1詞庫目錄

1.3 測試斷詞功能
測試文句:
疾管署今天表示,9月在高雄國際航空站捕到漢他病毒鼠,但沒有即時告知。雖未釀成疫情,疾管署承認疏失,將檢討內部流程並相關處分。



org.iwlp.controller.SemanticAnalysisController
    @RequestMapping(method=RequestMethod.POST,value="/segment")
    @ApiOperation(value = "中文斷詞",notes="輸入中文斷詞處理的文章內容,檢視斷詞結果")
    @ApiImplicitParams({
        @ApiImplicitParam(name = "text", value = "text", required = true, defaultValue = "這行文字是要被中文斷詞處理的文章,可以從執行結果看斷詞是否成功",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "fieldType", value = "fieldType", required = false, defaultValue = "ComplexSeg",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query")
        })
    public Object segment(
            @RequestParam(required=true, value = "text") String text,
            @RequestParam(required = false, value = "fieldType") FieldTypeEnum fieldType,
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh
            ) {
        String result = null;
        Segment segment = new Segment(isRefresh);
        if(fieldType == null){
            fieldType = FieldTypeEnum.ComplexSeg;
        }
        try {
            result = segment.segWords(text, " | ", fieldType);
        } catch (IOException e) {
            e.printStackTrace();
        }
        TextBody textBody = new TextBody();
        textBody.setText(text);
        return result;
    }
疾 | 管 | 署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢 | 他 | 病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾 | 管 | 署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分


1.4 調整詞庫
src\main\resources\data\words-lexicon.dic
最後一行加入自訂詞庫"疾管署", "漢他病毒"

輸出結果
疾管署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢他病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾管署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分

   
1.5 斷詞(分詞)完整程式碼

package org.iwlp.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.iwlp.model.FieldTypeEnum;
import org.iwlp.model.result.KeyWordFrequency;
import org.iwlp.model.result.KeyWordsStatistic;
import org.iwlp.utils.MapSortingUtils;
import org.iwlp.utils.StreamUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.SimpleSeg;
import com.chenlb.mmseg4j.Word;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/**
 * 程式資訊摘要:分詞<P>
 * 類別名稱  :Segment.java<P>
 * 程式內容說明:<P>
 * 程式修改記錄:<P>
 * date:2016年9月8日<P>
 *@author A50319
 *@version 1.0
 *@since 1.0
 */
public class Segment {
    private Dictionary dic;
    private static final Logger log = LoggerFactory.getLogger(Segment.class);

    /**
     * Segment建構子
     * @param isRefresh : 是否重新刷新Dictionary
     */
    public Segment(boolean isRefresh) {
        initDictionary(isRefresh);
    }

    private void initDictionary(boolean isRefresh){
        String filePath = StreamUtils.getResourceURL(Segment.class, "data").getFile();
        if(isRefresh){
            Dictionary.clear(filePath);
            log.debug("path:{}", filePath);
            log.debug("Dictionary refresh sataus:{}", isRefresh);
        }
        System.setProperty("mmseg.dic.path", filePath);    //這裡可以指定自訂詞庫
        dic = Dictionary.getInstance();
    }

    /**
     * 預設SEG
     * @return ComplexSeg
     */
    private Seg getSeg() {
        return new ComplexSeg(dic);
    }

    private Seg getSeg(FieldTypeEnum fieldType) {
        Seg seg = null;
//        log.debug("fieldType:{}", fieldType.name());
       if(fieldType == FieldTypeEnum.SimpleSeg){
//           log.debug("SimpleSeg:{}", FieldTypeEnum.SimpleSeg.name());
           seg = new SimpleSeg(dic);
       } else if(fieldType == FieldTypeEnum.ComplexSeg){
//           log.debug("ComplexSeg:{}", FieldTypeEnum.ComplexSeg.name());
           seg = new ComplexSeg(dic);
       } else if(fieldType == FieldTypeEnum.MaxWordSeg){
//           log.debug("MaxWordSeg:{}", FieldTypeEnum.MaxWordSeg.name());
           seg = new MaxWordSeg(dic);
       } else{
           seg = getSeg();
       }
        return seg;
    }

    /**
     * 斷詞(預設ComplexSeg分詞器)
     * @param text : 內文
     * @param wordSpilt : 分割標記文字
     * @return String
     * @throws IOException
     */
    public String segWords(String text, String wordSpilt) throws IOException {
        Reader inputReader = new StringReader(text);
        StringBuilder sb = new StringBuilder();
        Seg seg = getSeg();
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        boolean first = true;
        while((word=mmSeg.next())!=null) {
            if(!first) {
                sb.append(wordSpilt);
            }
            String w = word.getString();
            sb.append(w);
            first = false;

        }
        return sb.toString();
    }

    /**
     * 斷詞
     * @param text : 內文
     * @param wordSpilt : 分割標記文字
     * @param fieldType : 分詞器類型
     * @return String
     * @throws IOException
     */
    public String segWords(String text, String wordSpilt, FieldTypeEnum fieldType) throws IOException {
        Reader inputReader = new StringReader(text);
        StringBuilder sb = new StringBuilder();
        Seg seg = getSeg(fieldType);
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        boolean first = true;
        while((word=mmSeg.next())!=null) {
            if(!first) {
                sb.append(wordSpilt);
            }
            String w = word.getString();
            sb.append(w);
            first = false;

        }
        return sb.toString();
    }

    /**
     * 斷詞(預設ComplexSeg分詞器)
     * @param text : 內文
     * @return List<String> words
     */
    public List<String> segWords(String text) {
        List<String> words = new ArrayList<String>();
        Reader inputReader = new StringReader(text);
        Seg seg = getSeg();
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        try {
            while((word=mmSeg.next())!=null) {
                words.add(word.getString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally{
            IOUtils.closeQuietly(inputReader);
        }
        return words;
    }

    /**
     * 斷詞
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return List<String> words
     */
    public List<String> segWords(String text, FieldTypeEnum fieldType) {
        List<String> words = new ArrayList<String>();
        Reader inputReader = new StringReader(text);
        Seg seg = getSeg(fieldType);
        MMSeg mmSeg = new MMSeg(inputReader, seg);
        Word word = null;
        try {
            while((word=mmSeg.next())!=null) {
                words.add(word.getString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally{
            IOUtils.closeQuietly(inputReader);
        }
        return words;
    }


    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return Map<String, Integer>
     */
    public Map<String, Integer> calculateKeywordsMap(String text, FieldTypeEnum fieldType){
        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        Map<String, Object> tempKeywords = new HashMap<String, Object>();
        List<String> words = segWords(text, fieldType);
        for(String keyword : words){
            if(tempKeywords.containsKey(keyword)){
                int count = ((int) tempKeywords.get(keyword)) + 1;
                tempKeywords.put(keyword, count);
            } else{
                tempKeywords.put(keyword, 1);
            }
        }
        Gson gson = new GsonBuilder().create();
//        log.debug("sort before:\n{}",gson.toJson(MapSortingUtils.sortMapByValue(tempKeywords)));
        tempKeywords = MapSortingUtils.reverseMap(MapSortingUtils.sortMapByValue(tempKeywords));
//        log.debug("sort affter:\n{}",gson.toJson(tempKeywords));
        for(Entry<String, Object> entry : tempKeywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            keywords.put(key, value);
//            log.debug("key:{}, value:{}",key, value);
        }

        tempKeywords.clear();
        tempKeywords = null;

        return keywords;
    }

    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @return KeyWordsStatistic
     */
    public KeyWordsStatistic calculateKeywords(String text, FieldTypeEnum fieldType){
        KeyWordsStatistic ks = new KeyWordsStatistic();
        Map<String, Integer> keywords = calculateKeywordsMap(text, fieldType);
        ks.setTotal(keywords.size());
        List<KeyWordFrequency> kfs = new ArrayList<KeyWordFrequency>();
        for(Entry<String, Integer> entry : keywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            KeyWordFrequency kf = new KeyWordFrequency();
            kf.setKeyWord(key);
            kf.setCount(value);
            kfs.add(kf);
        }
        ks.setResult(kfs);
        return ks;
    }

    /**
     * 計算關鍵字出現頻率
     * @param text : 內文
     * @param fieldType : 分詞器類型
     * @param minKeywordLength : 關鍵字最小長度門檻
     * @param minFrequency : 關鍵字出現最小次數門檻
     * @return KeyWordsStatistic
     */
    public KeyWordsStatistic calculateKeywords(String text, FieldTypeEnum fieldType, int minKeywordLength, int minFrequency){
        KeyWordsStatistic ks = new KeyWordsStatistic();
        Map<String, Integer> keywords = calculateKeywordsMap(text, fieldType);
        List<KeyWordFrequency> kfs = new ArrayList<KeyWordFrequency>();
        for(Entry<String, Integer> entry : keywords.entrySet()){
            String key = entry.getKey();
            Integer value = (Integer) entry.getValue();
            if(key.length() >= minKeywordLength){
                if(value >= minFrequency){
                    KeyWordFrequency kf = new KeyWordFrequency();
                    kf.setKeyWord(key);
                    kf.setCount(value);
                    kfs.add(kf);
                }
            }
        }
        ks.setResult(kfs);
        ks.setTotal(ks.getResult().size());
        return ks;
    }

}


2.關鍵字統計
此功能目的為統計詞庫檔案內關鍵字於文章中出現的頻率,只要斷字斷詞再統計出現次數即可
minKeywordLength:關鍵字最小文字長度,若值為3,則"疾管署 | 承認 | 疏失",僅出現長度為3的關鍵字"疾管署"
minFrequency:關鍵字出現頻率,篩選關鍵字出現次數

2.1 測試關鍵字統計

org.iwlp.controller.SemanticAnalysisController
@RequestMapping(method=RequestMethod.POST,value="/keywords")
    @ApiOperation(value = "關鍵字統計",notes="根據文章內容統計關鍵字出現次數")
    @ApiImplicitParams({
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query"),
        @ApiImplicitParam(name = "fieldType", value = "fieldType", required = false, defaultValue = "ComplexSeg",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "minKeywordLength", value = "關鍵字最小文字長度", required = false, defaultValue = "1",dataType = "integer", paramType = "query"),
        @ApiImplicitParam(name = "minFrequency", value = "關鍵字出現頻率", required = false, defaultValue = "1",dataType = "integer", paramType = "query")
        })
    public KeyWordsStatistic calculateKeywords(
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh,
            @RequestParam(required = false, value = "fieldType") FieldTypeEnum fieldType,
            @RequestParam(required = false, value = "minKeywordLength") Integer minKeywordLength,
            @RequestParam(required = false, value = "minFrequency") Integer minFrequency,
           @RequestBody(required=true) final TextBody body
            ) {
        if(minKeywordLength == null){
            minKeywordLength = 1;
        }
        if(minFrequency == null){
            minFrequency = 1;
        }
        AssertUtils.isMoreThan(minKeywordLength, 1);
        AssertUtils.isMoreThan(minFrequency, 1);
        TextBody textBody = gson.fromJson(gson.toJson(body), TextBody.class);
        Segment segment = new Segment(isRefresh);
        if(fieldType == null){
            fieldType = FieldTypeEnum.ComplexSeg;
        }
        KeyWordsStatistic kws = segment.calculateKeywords(textBody.getText(), fieldType, minKeywordLength, minFrequency);
        return kws;
    }
{"total":29,"result":[{"keyword":"疾管署","count":2},{"keyword":"高雄","count":1},{"keyword":"9","count":1},{"keyword":"疫情","count":1},{"keyword":"到","count":1},{"keyword":"內部","count":1},{"keyword":"相關","count":1},{"keyword":"在","count":1},{"keyword":"流程","count":1},{"keyword":"並","count":1},{"keyword":"即時","count":1},{"keyword":"承認","count":1},{"keyword":"檢討","count":1},{"keyword":"鼠","count":1},{"keyword":"漢他病毒","count":1},{"keyword":"站","count":1},{"keyword":"國際航空","count":1},{"keyword":"今天","count":1},{"keyword":"捕","count":1},{"keyword":"告知","count":1},{"keyword":"疏失","count":1},{"keyword":"有","count":1},{"keyword":"月","count":1},{"keyword":"將","count":1},{"keyword":"雖未","count":1},{"keyword":"但沒","count":1},{"keyword":"釀成","count":1},{"keyword":"表示","count":1},{"keyword":"處分","count":1}]}


3.情緒詞庫
本文中情緒分析採用作者der3318所開發之情緒分析,並加以修改
根據第2章之中文斷詞機制將文章的關鍵字分離出來,並判斷這些關鍵字代表之情緒為正面或負面
經統計後可得情緒分析分數

可參考
SentimentAnalyzer:http://ift.tt/2oG3Yi3
內建採用結巴(jieba)中文斷詞,此處修改為mmseg4j,原因是原先系統已採用mmseg4j

3.1 情緒詞庫目錄
最主要為定義正面詞庫(positive.txt)以及負面詞庫(negative.txt)



positive.txt


negative.txt


3.2 中文斷詞設定
此處故意沒刪除結巴(jieba)中文斷詞之處,請參閱綠色被註解的地方
org.iwlp.core.analyzer.SegChinese
package org.iwlp.core.analyzer;
import java.io.IOException;
import java.util.ArrayList;
import org.iwlp.core.Segment;
import org.iwlp.model.FieldTypeEnum;
//import com.huaban.analysis.jieba.JiebaSegmenter;
//import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
//import com.huaban.analysis.jieba.SegToken;
public class SegChinese {
       // a static Segmenter shared by all analyzers
       private static SegChinese seg;
       
       //mmseg4j
       private Segment segmenter;
       public static boolean isRefresh;
       
       //結巴分詞
//     protected JiebaSegmenter segmenter;
//     public SegChinese() {
//           segmenter = new JiebaSegmenter();
//     }
       public SegChinese(boolean isRefresh) {
        segmenter = new Segment(isRefresh);
    }
       
       // return the prepared Segmenter. if not found, create one
       public static SegChinese getInstance() {
        if (seg == null) {
            synchronized (SegChinese.class) {
                if (seg == null) {
                    seg = new SegChinese(isRefresh);
                    return seg;
                }
            }
        }
        return seg;
    }
       public static void removeInstance() {
           seg = null;
       }
       
       public ArrayList<String> getSegList(String text) throws IOException {
             
             return (ArrayList<String>) segmenter.segWords(text, FieldTypeEnum.ComplexSeg);
       }
       public String segWords(String text, String wordSpilt) throws IOException {
             return segmenter.segWords(text, wordSpilt);
       }
       
       
//     public ArrayList<String> getSegList(String txt) throws IOException {
//        ArrayList<String> output = new ArrayList<String>();
//        for( SegToken token : segmenter.process(txt, SegMode.INDEX) )   if( !token.word.isEmpty() ) output.add(token.word);
//        return output;
//    }
//
//    public String segWords(String txt, String wordSpilt) throws IOException {
//        String output = new String("");
//        for(  SegToken token : segmenter.process(txt, SegMode.INDEX) )  output += (token.word + wordSpilt);
//        return output;
//    }
}

3.3 情緒分析流程
情緒計算方式為將文章

疾管署今天表示,9月在高雄國際航空站捕到漢他病毒鼠,但沒有即時告知。雖未釀成疫情,疾管署承認疏失,將檢討內部流程並相關處分。衛生福利部疾病管制署今天發布新聞稿表示,9月在高雄國際航空站捕獲錢鼠送實驗室檢驗檢出漢他病毒陽性,當時並未立即通知航空站,日前與航站定期召開衛生小組會議時,才告知並要求加強環境清潔與防鼠措施。

用以設定之中文斷詞進行斷句斷詞

疾管署 | 今天 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕 | 到 | 漢他病毒 | 鼠 | 但沒 | 有 | 即時 | 告知 | 雖未 | 釀成 | 疫情 | 疾管署 | 承認 | 疏失 | 將 | 檢討 | 內部 | 流程 | 並 | 相關 | 處分 | 衛生 | 福利 | 部 | 疾病 | 管制 | 署 | 今天 | 發 | 布 | 新聞稿 | 表示 | 9 | 月 | 在 | 高雄 | 國際航空 | 站 | 捕獲 | 錢鼠 | 送 | 實驗室 | 檢驗 | 檢出 | 漢他病毒 | 陽性 | 當時 | 並未 | 立即 | 通知 | 航空站 | 日前 | 與 | 航 | 站 | 定期 | 召開 | 衛生 | 小組 | 會議 | 時 | 才 | 告知 | 並要 | 求 | 加強 | 環境 | 清潔 | 與 | 防鼠 | 措施

之後再進行正負情緒關鍵字之初始化,即正面詞庫分數定為1分,負面詞庫定為-1分,此處將英文字母一律轉為小寫
package org.iwlp.core.analyzer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SentimentalDictionary {
    private static final Logger log = LoggerFactory.getLogger(SentimentalDictionary.class);
    // a static dictionary shared by all analyzers 
    private static SentimentalDictionary dictionary;
    // filenames
    private static String filenameP = new String("./docs/positive.txt");
    private static String filenameN = new String("./docs/negative.txt");
    private static String filenameADV = new String("./docs/adv.txt");

    // a HashMap holding sentimental words as keys
    private HashMap<String, Integer> mydict = new HashMap<String, Integer>();
    // a HashMap holding adverbs as keys
    private HashMap<String, Boolean> myadv = new HashMap<String, Boolean>();

    // return the prepared dictionary. if not found, create one
    public static SentimentalDictionary getInstance() {
        if (dictionary == null) {
            synchronized (SentimentalDictionary.class) {
                if (dictionary == null) {
                    dictionary = new SentimentalDictionary();
                    dictionary.makeDict();
                    return dictionary;
                }
            }
        }
        return dictionary;
    }

    // remove the current dictionary due to some setting changes
    public static void removeInstance() {
        dictionary = null;
    }

    public static void setFilename(String _filenameP, String _filenameN, String _filenameADV) {
        filenameP = _filenameP;
        filenameN = _filenameN;
        filenameADV = _filenameADV;
    }

    // add a positive word into dictionary
    public synchronized void addPositiveWords(String _string) {
        if( mydict.containsKey(_string) )    mydict.put(_string, mydict.get(_string) + 1);
        else    mydict.put(_string, 1);
    }

    // add a negative word into dictionary
    public synchronized void addNegativeWords(String _string) {
        if( mydict.containsKey(_string) )    mydict.put(_string, mydict.get(_string) - 1);
        else    mydict.put(_string, -1);
    }

    // get the score of the sentimental word, and return 0 when not found
    public int checkWord(String _string) {
        if( _string.isEmpty() || !mydict.containsKey(_string) )    return 0;
        if( mydict.get(_string) > 0 )    return 1;
        return -1;
    }

    // check if the input word is adv or not
    public boolean checkAdv(String _string) {
        if( myadv.containsKey(_string) && !_string.isEmpty() )    return true;
        return false;
    }

    // return an ArrayList containing positive words
    public ArrayList<String> getPositiveWords() {
        ArrayList<String> output_list = new ArrayList<String>();
        for( String key : mydict.keySet() )    if( mydict.get(key) > 0 )    output_list.add(key);
        return output_list;
    }

    // return an ArrayList containing negative words
    public ArrayList<String> getNegativeWords() {
        ArrayList<String> output_list = new ArrayList<String>();
        for( String key : mydict.keySet() )    if( mydict.get(key) < 0 )    output_list.add(key);
        return output_list;
    }

    // get the size(numbers of words) of the dictionary
    public int getSize() {
        return mydict.size() + myadv.size();
    }

    // print dictionary
    public void printDict() {
        for( String key : mydict.keySet() )    System.out.println(key + ", " + mydict.get(key));
    }

    // put the words into the the HashMaps from 3 input files(positive sentimental words, negative sentimental words, adverbs)
    public void makeDict() {        
        try {
            // access positive words
            String[] filenames = {filenameP, /*"./docs/pos_by_training.txt"*/};
            for(String filename : filenames) {
                log.debug("Accessing :{}" , filename);
                FileReader fr = new FileReader(filename);
                BufferedReader br = new BufferedReader(fr);
                String tmp = br.readLine();
                while(tmp != null) {
                    //新增正面詞庫(英文自動轉小寫)
                    addPositiveWords( tmp.trim().toLowerCase() );
                    tmp = br.readLine();
                }
                br.close();
            }    
        }
        catch (Exception e) {
            System.out.println("File of Positive Words Not Found");
            e.printStackTrace();
        }
        try {
            // access negative words
            String[] filenames = {filenameN/*, "./docs/neg_by_training.txt"*/};
            for(String filename : filenames) {
//                System.out.println("Accessing " + filename);
                log.debug("Accessing:{}", filename);
                FileReader fr = new FileReader(filename);
                BufferedReader br = new BufferedReader(fr);
                String tmp = br.readLine();
                while(tmp != null) {
                    //新增負面詞庫(英文自動轉小寫)
                    addNegativeWords( tmp.trim().toLowerCase() );
                    tmp = br.readLine();
                }
                br.close();
            }
        }
        catch (Exception e) {
            System.out.println("File of Negative Words Not Found");
            e.printStackTrace();
        }
        try {
            log.debug("Accessing :{}", filenameADV);
            // access negative words
            FileReader fr = new FileReader(filenameADV);
            BufferedReader br = new BufferedReader(fr);
            String tmp = br.readLine();
            while(tmp != null) {
                myadv.put(tmp.trim() , true);
                tmp = br.readLine();
            }
//            br.close();
            IOUtils.closeQuietly(br);
            IOUtils.closeQuietly(fr);
        }
        catch (Exception e) {
            System.out.println("File of Adverbs Not Found");
            e.printStackTrace();
        }
    }

}


再從斷字後的關鍵字中查找正面情緒詞庫和負面情緒詞庫,並計算出現次數

承認(1) | 疏失(-1) | 檢討(-1) | 處分(-1) | 衛生(1) | 衛生(1) | 清潔(1)

統計所有正負評分
正面關鍵字分數6
負面關鍵字分數8
6-8 = -2, 即本文章情緒分數總分
{
  "time": "2 mSec",
  "lexicon": {
    "positive": 6,
    "negative": 8
  },
  "comment": "Negative",
  "result": [
    {
      "keyword": "衛生",
      "value": 2
    },
    {
      "keyword": "承認",
      "value": 1
    },
    {
      "keyword": "清潔",
      "value": 1
    },
    {
      "keyword": "福利",
      "value": 1
    },
    {
      "keyword": "通知",
      "value": 1
    },
    {
      "keyword": "毒",
      "value": -2
    },
    {
      "keyword": "要求",
      "value": -1
    },
    {
      "keyword": "處分",
      "value": -1
    },
    {
      "keyword": "檢討",
      "value": -1
    },
    {
      "keyword": "疾病",
      "value": -1
    },
    {
      "keyword": "沒有",
      "value": -1
    },
    {
      "keyword": "疏失",
      "value": -1
    }
  ]
}

3.4 測試情緒分析

org.iwlp.controller.SemanticAnalysisController
@RequestMapping(method=RequestMethod.POST,value="/sentimentAnalysis")
    @ApiOperation(value = "情緒分析",notes="根據文章內容正面與負面關鍵字分析文章正負評價")
    @ApiImplicitParams({
//        @ApiImplicitParam(name = "text", value = "text", required = true, defaultValue = "這酒店骯髒環境差,服務人員態度不佳,不敢恭維",dataType = "string", paramType = "query"),
        @ApiImplicitParam(name = "topN", value = "正負面詞庫數量門檻</br>-1:不限數量,default:10", required = false, defaultValue = "10",dataType = "int", paramType = "query"),
        @ApiImplicitParam(name = "isRefresh", value = "是否刷新字典檔", required = true, defaultValue = "false",dataType = "boolean", paramType = "query")
        })
    public Object sentimentAnalysis(
//            @RequestParam(value = "text") String text,
            @RequestParam(required = false, value = "topN") Integer topN,
            @RequestParam(required=true, value = "isRefresh") Boolean isRefresh,
            @RequestBody(required=true) final TextBody body
            ) {
        if(topN == null){
            topN = 10;
        }
        AssertUtils.isMoreThan(topN, -1);
       
        TextBody textBody = gson.fromJson(gson.toJson(body), TextBody.class);
        Sentiment sentiment = new Sentiment();
        String positiveDict = StreamUtils.getResourceURL(Segment.class, "sa/positive.txt").getFile();
        String negativeDict = StreamUtils.getResourceURL(Segment.class, "sa/negative.txt").getFile();
        String advDict = StreamUtils.getResourceURL(Segment.class, "sa/adv.txt").getFile();
        String trainingFile = StreamUtils.getResourceURL(Segment.class, "sa/training.txt").getFile();
        String trainingAnswer = StreamUtils.getResourceURL(Segment.class, "sa/answer.txt").getFile();
       
//        String option = StreamUtils.getResourceURL(Segment.class, "sa/opinion.txt").getFile();
//        String result = StreamUtils.getResourceURL(Segment.class, "sa/sa-result.txt").getFile();
       
//        log.debug("positiveDict:{}",positiveDict);
//        log.debug("negativeDict:{}",negativeDict);
//        log.debug("advDict:{}",advDict);
//        log.debug("trainingFile:{}",trainingFile);
//        log.debug("trainingAnswer:{}",trainingAnswer);
//        log.debug("option:{}",option);
//        log.debug("result:{}",result);
       
        SentimentAnalyzer.setDictionary(positiveDict, negativeDict, advDict);
        SentimentAnalyzer.setTrainingData(trainingFile, trainingAnswer);
       
        //set PMI-SO Rate, deciding how strictly the analyzer chooses words (default = 3.0)
        SentimentAnalyzer.setSORate(3.0);
        //set the number of threads available for the analyzer (default = 4)
        SentimentAnalyzer.setNTHREADS(4);
        //create the analyzer wth I/O filename (default= ./docs/opinion.txt, ./result.txt)
       
//        String posByTrainingPath, String negByTrainingPath
//        String posByTrainingPath = StreamUtils.getResourceURL(Segment.class, "sa/positive.txt").getFile();
//        String negByTrainingPath = StreamUtils.getResourceURL(Segment.class, "sa/negative.txt").getFile();
//        SentimentAnalyzer sa = new SentimentAnalyzer(option, result,  posByTrainingPath, negByTrainingPath, isRefresh);
       
        SentimentAnalyzer sa = new SentimentAnalyzer(isRefresh);
        sentiment = sa.calculateSentiment(textBody.getText(), topN);
//        sa.workAndWriteFile();
      
        return sentiment;
    }
{
  "time": "2 mSec",
  "lexicon": {
    "positive": 6,
    "negative": 8
  },
  "comment": "Negative",
  "result": [
    {
      "keyword": "衛生",
      "value": 2
    },
    {
      "keyword": "承認",
      "value": 1
    },
    {
      "keyword": "清潔",
      "value": 1
    },
    {
      "keyword": "福利",
      "value": 1
    },
    {
      "keyword": "通知",
      "value": 1
    },
    {
      "keyword": "毒",
      "value": -2
    },
    {
      "keyword": "要求",
      "value": -1
    },
    {
      "keyword": "處分",
      "value": -1
    },
    {
      "keyword": "檢討",
      "value": -1
    },
    {
      "keyword": "疾病",
      "value": -1
    },
    {
      "keyword": "沒有",
      "value": -1
    },
    {
      "keyword": "疏失",
      "value": -1
    }
  ]
}

4.API DOC

http://ift.tt/2oG3qc8

5.專案配置檔
Gradle configuration
build.gradle
// Spring Boot 1.4.2 build: the buildscript block pulls in the Boot Gradle plugin.
buildscript {
       ext {
             springBootVersion = '1.4.2.RELEASE'
       }
       repositories {
             mavenCentral()
       }
       dependencies {
              classpath("org.springframework.boot:spring-boot-gradle-plugin:${springBootVersion}")
       }
}
apply plugin: 'java'
apply plugin: 'eclipse-wtp'
apply plugin: 'org.springframework.boot'
apply plugin: 'war'
// Package as a deployable WAR named iwlp-sa.
war {
       baseName = 'iwlp-sa'
       version = '0.0.1-SNAPSHOT'
}
sourceCompatibility = 1.8
targetCompatibility = 1.8
repositories {
       mavenCentral()
}
// providedRuntime: Tomcat is supplied by the servlet container, not the WAR.
configurations {
       providedRuntime
}
dependencies {
       compile('org.springframework.boot:spring-boot-starter-web')
       runtime('mysql:mysql-connector-java')
       providedRuntime('org.springframework.boot:spring-boot-starter-tomcat')
       testCompile('org.springframework.boot:spring-boot-starter-test')
       
       //swagger
       compile group: 'io.springfox', name: 'springfox-swagger2', version: '2.5.0'
       compile group: 'io.springfox', name: 'springfox-swagger-ui', version: '2.5.0'
       
       compile group: 'com.restfb', name: 'restfb', version: '1.31.0'
       
       // http://ift.tt/2o4C7vm
       compile group: 'org.jsoup', name: 'jsoup', version: '1.9.2'
       
       //FOR CrawlerPack
       compile group: 'com.googlecode.juniversalchardet', name: 'juniversalchardet', version: '1.0.3'
       
       // Chinese word segmentation (mmseg4j)
       // http://ift.tt/2nRHhdx
       // NOTE(review): the three mmseg4j artifacts pin different versions
       // (1.10.0 / 1.9.1 / 1.8.6) — confirm this mix is intentional.
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-core', version: '1.10.0'
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-analysis', version: '1.9.1'
       compile group: 'com.chenlb.mmseg4j', name: 'mmseg4j-core-with-dic', version: '1.8.6'
       compile group: 'com.google.code.gson', name: 'gson', version: '2.7'
       
       compile group: 'commons-io', name: 'commons-io', version: '2.5'
       compile group: 'commons-lang', name: 'commons-lang', version: '2.6'
       
       compile group: 'org.apache.commons', name: 'commons-vfs2', version: '2.1'
       //http client
       compile group: 'org.apache.httpcomponents', name: 'httpcore', version: '4.4.5'
       compile group: 'org.apache.commons', name: 'commons-compress', version: '1.12'
       compile group: 'commons-httpclient', name: 'commons-httpclient', version: '3.1'
       compile group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.2'
       compile group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.2'
}
springBoot {
    mainClass = "org.iwlp.IwlpSaApplication"
}

Spring Boot configuration
application.properties
# Serve the application under the /iwlp-sa/ context path.
server.contextPath=/iwlp-sa/
#logging
# Console pattern: time [thread] level logger.method [line] - message + root-cause stack trace
logging.pattern.console=%d{HH:mm:ss.SSS} [%thread] %-5level %logger{26}.%M [%line] - %msg%n%rEx
logging.level.org.springframework.web=WARN
logging.level.root=WARN
# Verbose logging for the application's own packages only.
logging.level.org.iwlp.config=DEBUG
logging.level.org.iwlp.controller=DEBUG
logging.level.org.iwlp.core=DEBUG
# JMX endpoint naming (unique names avoid MBean registration clashes).
endpoints.jmx.domain=iwlp-sa
endpoints.jmx.uniqueNames=true


6.專案下載



Tags: Spring, Spring Boot, Sentiment Analyzer, IFTTT-SYNC
November 30, 2016 at 02:23PM
Open in Evernote