Describe 데블2011//Machine-Learning/NaiveBayesClassifier/동 here
Train.java
Train Politics.txt : 0.94 (94%)
Average : 0.885 (88.5%)
럼 는 (http, //, blog, yahoo, empas, tistory ) 를 .
Advantage 를 부는데, Section Unique 빈 Advantage를 부. (만 Section 대.)
런 0.5% 밖 더..
를 볼 면 ^^;;
Train.java
package org.zeropage.machinelearn;

import java.io.*;
import java.util.*;

/**
 * Collects word statistics for one section (category) from a training file.
 *
 * <p>Each line of the file is treated as one article and is split on
 * whitespace into words. The class keeps the per-word occurrence count, the
 * total number of counted words, and the number of articles (lines) read.
 */
class Trainer {
    /**
     * Tokens that are never counted (URL fragments, portal names, ...).
     * NOTE(review): the last entry is a single character, so it is already
     * covered by the length check in {@link #isSkipData}; the literal may have
     * been truncated at some point - confirm against the original source.
     */
    private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
            "http", "blog", "com", "naver", "empas", "daum", "yahoo", "tistory",
            "co", "kr", "www", "ohmynews", "//", "블"));

    // Initialized eagerly so the getters are safe to call before TrainData().
    private HashMap<String,Integer> sectionWord = new HashMap<String,Integer>();
    private int sectionWordNum;
    private int sectionArticleNum;
    private final File fileName;

    /**
     * @param f training file for this section; one article per line
     */
    public Trainer(File f) {
        this.fileName = f;
    }

    /**
     * Decides whether a token should be left out of the statistics.
     *
     * @param inputStr a single whitespace-delimited token
     * @return {@code true} for empty or one-character tokens and for the
     *         words listed in {@link #SKIP_WORDS}; {@code false} otherwise
     */
    private boolean isSkipData(String inputStr) {
        // "<= 1" rather than "== 1": String.split yields an empty first token
        // for blank lines and for lines that start with whitespace, and that
        // empty token must not be counted as a word.
        return inputStr.length() <= 1 || SKIP_WORDS.contains(inputStr);
    }

    /**
     * Reads the training file and rebuilds all statistics from scratch.
     *
     * <p>Every line, including a blank one, counts as one article. If the
     * file cannot be opened the stack trace is printed and the statistics
     * stay empty.
     */
    public void TrainData() {
        this.sectionWordNum = 0;
        this.sectionArticleNum = 0;
        this.sectionWord = new HashMap<String,Integer>();

        Scanner sectionLearn = null;
        try {
            sectionLearn = new Scanner(this.fileName);
            while (sectionLearn.hasNextLine()) {
                this.sectionArticleNum++;
                for (String wordTmp : sectionLearn.nextLine().split("\\s+")) {
                    if (isSkipData(wordTmp)) {
                        continue;
                    }
                    Integer seen = this.sectionWord.get(wordTmp);
                    this.sectionWord.put(wordTmp, seen == null ? 1 : seen + 1);
                    // Total word count includes repeated occurrences.
                    this.sectionWordNum++;
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Close the scanner even if reading fails part-way through.
            if (sectionLearn != null) {
                sectionLearn.close();
            }
        }
    }

    /**
     * @return the live word-to-count map built by the last {@link #TrainData()}
     *         call (not a copy)
     */
    public HashMap<String,Integer> getSectionData() {
        return this.sectionWord;
    }

    /**
     * @return total number of counted words, repeated occurrences included
     */
    public int getSectionWordsNumber() {
        return this.sectionWordNum;
    }

    /**
     * Returns the occurrence count of a word plus one.
     *
     * @param word the word to look up
     * @return stored count + 1, so a word that was never seen yields 1
     *         instead of 0
     */
    public int getSectionWordNumber(String word) {
        Integer seen = sectionWord.get(word);
        return (seen == null) ? 1 : seen + 1;
    }

    /**
     * @return number of articles (lines) read from the training file
     */
    public int getSectionArticleNumber() {
        return this.sectionArticleNum;
    }
}
Analyzer.java
package org.zeropage.machinelearn;

import java.util.*;
import java.io.*;

/**
 * Scores articles against one section using the log-odds sum
 * {@code Ln[p(S)/p(!S)] + Sigma Ln[p(Wi ^ S)/p(Wi ^ !S)]}, built from one
 * {@link Trainer} per section.
 */
public class Analyzer {
    private Trainer[] sectionTrain;
    // Totals over every section except the one currently being evaluated;
    // refreshed by CalculateNotInSection(index).
    private int notInSectionArticleSum = 0;
    private int notInSectionWordTotalSum = 0;

    /**
     * Trains one {@link Trainer} per file; the position in {@code dataList}
     * becomes the section index used by {@link #DocumentResult(File, int)}.
     *
     * @param dataList training files, one per section
     */
    public Analyzer(File[] dataList) {
        this.sectionTrain = new Trainer[dataList.length];
        for (int i = 0; i < sectionTrain.length; i++) {
            sectionTrain[i] = new Trainer(dataList[i]);
            sectionTrain[i].TrainData();
        }
    }

    /**
     * Recomputes the article and word totals of all sections other than
     * {@code index}.
     */
    private void CalculateNotInSection(int index) {
        int articles = 0;
        int words = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i == index) {
                continue;
            }
            articles += sectionTrain[i].getSectionArticleNumber();
            words += sectionTrain[i].getSectionWordsNumber();
        }
        this.notInSectionArticleSum = articles;
        this.notInSectionWordTotalSum = words;
    }

    /**
     * Computes the score of one article for section {@code index}: the prior
     * term plus the per-word terms of every whitespace-separated token.
     *
     * @return the summed log-odds; callers treat a value below 0 as
     *         "not in this section"
     */
    private double getWeight(int index, String Article) {
        double total = getLnPsPns(index);
        for (String token : Article.split("\\s+")) {
            total += getLnPwsPwns(index, token);
        }
        return total;
    }

    /**
     * Ln[p(S) / p(!S)]: log of this section's article count over the article
     * count of all other sections. Requires CalculateNotInSection(index) to
     * have been called first.
     */
    private double getLnPsPns(int index) {
        double inSection = sectionTrain[index].getSectionArticleNumber();
        return Math.log(inSection / notInSectionArticleSum);
    }

    /**
     * Ln[p(Wi ^ S) / p(Wi ^ !S)] for a single word, where the count outside
     * the section is replaced by the adjusted value from
     * {@link #ArticleAdvantage(int, String)}.
     */
    private double getLnPwsPwns(int index, String word) {
        Trainer own = sectionTrain[index];
        double inSection = (double) own.getSectionWordNumber(word) / own.getSectionWordsNumber();
        double outOfSection = (double) ArticleAdvantage(index, word) / notInSectionWordTotalSum;
        return Math.log(inSection / outOfSection);
    }

    /**
     * Sums the word's counts over the other sections, applying an "advantage"
     * when the word is unseen in another section (its count there is 1) but
     * seen in section {@code index} (count above 1): that section then
     * contributes {@code 1 - (count / articleCount * 50)} instead of its
     * count.
     *
     * NOTE(review): the advantage term, and therefore the returned sum, can be
     * zero or negative; the caller then takes the log of a non-positive ratio,
     * and a NaN weight is not "< 0" so the article is counted as a match -
     * confirm this is intended.
     */
    private double ArticleAdvantage(int index, String word) {
        int ownCount = sectionTrain[index].getSectionWordNumber(word);
        int ownArticles = sectionTrain[index].getSectionArticleNumber();

        double advantageResult = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i == index) {
                continue;
            }
            int otherCount = sectionTrain[i].getSectionWordNumber(word);
            if (otherCount == 1 && ownCount > 1) {
                advantageResult += (1 - ((double) ownCount / ownArticles * 50));
            } else {
                advantageResult += otherCount;
            }
        }
        return advantageResult;
    }

    /**
     * Scores every line of {@code f} against section {@code index} and prints
     * how many lines scored non-negative ("Right") versus negative ("Wrong").
     *
     * <p>If the file cannot be opened the stack trace is printed and the
     * summary line is still written with zero counts.
     *
     * NOTE(review): the printed "Result" adds the prior term Ln[p(S)/p(!S)] to
     * the Right ratio, so it equals the ratio only when that term is 0 -
     * confirm this is intended.
     *
     * @param f     file holding one article per line
     * @param index section index, as ordered in the constructor's file list
     */
    public void DocumentResult(File f, int index) {
        int negaNum = 0;
        int posiNum = 0;
        CalculateNotInSection(index);

        Scanner targetDocument = null;
        try {
            targetDocument = new Scanner(f);
            while (targetDocument.hasNextLine()) {
                if (getWeight(index, targetDocument.nextLine()) < 0) {
                    negaNum++;
                } else {
                    posiNum++;
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Close the scanner even if scoring a line throws.
            if (targetDocument != null) {
                targetDocument.close();
            }
        }

        System.out.println("Right : " + posiNum + " Wrong : " + negaNum + " Result : " + (getLnPsPns(index) + ((double)posiNum / (posiNum+negaNum))));
    }
}
Runner.java
package org.zeropage.machinelearn;

import java.io.File;

/**
 * Entry point: trains on the economy and politics data sets, then evaluates
 * each section's test file against its own section.
 */
public class Runner {
    public static void main(String[] args) {
        // Training files; the array position is the section index.
        File economyTrain = new File("svm_data.tar/package/train/economy/index.economy.db");
        File politicsTrain = new File("svm_data.tar/package/train/politics/index.politics.db");
        Analyzer analyzer = new Analyzer(new File[] { economyTrain, politicsTrain });

        // Test files, listed in the same section order as the training files.
        File[] testDocuments = {
            new File("svm_data.tar/package/test/economy/economy.txt"),
            new File("svm_data.tar/package/test/politics/politics.txt"),
        };
        for (int section = 0; section < testDocuments.length; section++) {
            analyzer.DocumentResult(testDocuments[section], section);
        }
    }
}
Train Economy.txt : 0.83 (83%)
Train Politics.txt : 0.94 (94%)
Average : 0.885 (88.5%)
럼 는 (http, //, blog, yahoo, empas, tistory ) 를 .
Advantage 를 부는데, Section Unique 빈 Advantage를 부. (만 Section 대.)
런 0.5% 밖 더..
를 볼 면 ^^;;