Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here

Train.java
{{{
package org.zeropage.machinelearn;

import java.io.*;
import java.util.*;

/**
 * Builds word-frequency tables for the "economy" and "politics" training
 * corpora. Call {@link #TrainData()} first; the getters return whatever that
 * call produced (the maps are {@code null} until then).
 */
class Train {
    private static final String ECONOMY_TRAIN_PATH =
            "svm_data.tar/package/train/economy/index.economy.db";
    private static final String POLITIC_TRAIN_PATH =
            "svm_data.tar/package/train/politics/index.politics.db";

    /** Tokens that are ignored during training regardless of their length. */
    private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
            "http", "blog", "com", "naver", "empas", "daum", "yahoo", "tistory",
            "co", "kr", "www", "ohmynews", "//", "블로그"));

    private Map<String, Integer> economyWord;
    private Map<String, Integer> politicWord;
    private int economyNum;
    private int politicNum;

    /**
     * Tells whether a token should be left out of the frequency tables.
     *
     * @param inputStr the token to check
     * @return {@code true} for single-character tokens and for the listed
     *         site/blog/URL fragments, {@code false} otherwise
     */
    private boolean isSkipData(String inputStr) {
        // Site, blog and page-address fragments carry little topical meaning,
        // so they are excluded from the search.
        return inputStr.length() == 1 || SKIP_WORDS.contains(inputStr);
    }

    /**
     * Resets all state and re-reads both training files.
     *
     * <p>If a file cannot be opened the stack trace is printed and loading
     * stops at that point; tables filled before the failure are kept.
     */
    public void TrainData() {
        this.economyNum = 0;
        this.politicNum = 0;
        this.economyWord = new HashMap<String, Integer>();
        this.politicWord = new HashMap<String, Integer>();

        try {
            this.economyNum = countWords(ECONOMY_TRAIN_PATH, this.economyWord);
            this.politicNum = countWords(POLITIC_TRAIN_PATH, this.politicWord);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds every non-skipped, space-separated token of the file to
     * {@code counts}.
     *
     * @param path   file to read, one line at a time
     * @param counts table to update: token -> number of occurrences
     * @return how many tokens were inserted into {@code counts} for the first time
     * @throws FileNotFoundException if {@code path} cannot be opened
     */
    private int countWords(String path, Map<String, Integer> counts)
            throws FileNotFoundException {
        int distinct = 0;
        Scanner in = new Scanner(new File(path));
        try {
            while (in.hasNextLine()) {
                for (String word : in.nextLine().split(" ")) {
                    if (isSkipData(word)) {
                        continue;
                    }
                    Integer seen = counts.get(word);
                    if (seen == null) {
                        distinct++;
                        counts.put(word, 1);
                    } else {
                        counts.put(word, seen + 1);
                    }
                }
            }
        } finally {
            // Close the scanner even when reading fails part-way.
            in.close();
        }
        return distinct;
    }

    /** @return the economy table (token -> occurrences) */
    public HashMap<String, Integer> getEconomyData() {
        return (HashMap<String, Integer>) this.economyWord;
    }

    /** @return the politics table (token -> occurrences) */
    public HashMap<String, Integer> getPoliticData() {
        return (HashMap<String, Integer>) this.politicWord;
    }

    /** @return the number of distinct tokens in the economy table */
    public int getEconomyNumber() {
        return this.economyNum;
    }

    /** @return the number of distinct tokens in the politics table */
    public int getPoliticNumber() {
        return this.politicNum;
    }
}
}}}

Analyzer.java
{{{
package org.zeropage.machinelearn;

import java.util.*;
import java.io.*;

/**
 * Scores the economy and politics test files against the tables produced by
 * {@link Train} and prints the hit ratio for each file and their average.
 */
public class Analyzer {
    private static HashMap<String, Integer> ecoData;
    private static HashMap<String, Integer> polData;
    private static Train machineTrain;

    /**
     * Scores a file line by line and reports the share of lines that lean
     * towards the expected category.
     *
     * <p>For every space-separated token the occurrence counts from both
     * tables are incremented by one and the logarithm of their ratio
     * (expected category over the other one) is added to the line's score.
     * A line counts as a hit when its score is not negative.
     *
     * @param f         file to score
     * @param isEconomy {@code true} if the lines are expected to be economy
     *                  text, {@code false} if politics
     * @return hits divided by the number of lines read; NaN when no line was
     *         read (empty file, or the file could not be opened)
     */
    private static double DocumentResult(File f, boolean isEconomy) {
        double negaNum = 0;
        double posiNum = 0;

        try {
            Scanner targetDocument = new Scanner(f);
            try {
                while (targetDocument.hasNextLine()) {
                    double lineScore = 0;
                    for (String word : targetDocument.nextLine().split(" ")) {
                        Integer ecoCount = ecoData.get(word);
                        Integer polCount = polData.get(word);

                        // Add one to both counts so neither side of the ratio is zero.
                        double ecoScore = (ecoCount == null ? 0 : ecoCount) + 1;
                        double polScore = (polCount == null ? 0 : polCount) + 1;

                        // Economy document, word missing from the politics
                        // table: give the economy side an advantage.
                        if (isEconomy && polCount == null) {
                            polScore -= 0.5;
                        }
                        // Politics document, word missing from the economy
                        // table: give the politics side an advantage.
                        if (!isEconomy && ecoCount == null) {
                            ecoScore -= 0.5;
                        }

                        if (isEconomy) {
                            lineScore += Math.log(ecoScore / polScore);
                        } else {
                            lineScore += Math.log(polScore / ecoScore);
                        }
                    }

                    if (lineScore < 0) {
                        negaNum += 1;
                    } else {
                        posiNum += 1;
                    }
                }
            } finally {
                // Close the scanner even when scoring fails part-way.
                targetDocument.close();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        return posiNum / (posiNum + negaNum);
    }

    /** Trains a fresh {@link Train} instance and caches its two tables. */
    public static void Init() {
        machineTrain = new Train();
        machineTrain.TrainData();
        ecoData = machineTrain.getEconomyData();
        polData = machineTrain.getPoliticData();
    }

    /**
     * Trains, scores both test files and prints the two ratios followed by
     * their average.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Init();
        double result1 = DocumentResult(
                new File("svm_data.tar/package/test/economy/economy.txt"), true);
        System.out.println(result1);
        double result2 = DocumentResult(
                new File("svm_data.tar/package/test/politics/politics.txt"), false);
        System.out.println(result2);
        System.out.println((result1 + result2) / 2);
    }
}
}}}

Train 의 Economy.txt 파일 적중도 : 0.995 (99.5%)
Train 의 Politics.txt 파일 적중도 : 0.96 (96%)
전체 평균 적중도 : 0.9775 (97.75%)

위의 주석처럼 약간의 Advantage 와 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다. 이 결과를 볼 수 있었으면 좋겠네요 ^^;;