2011//Machine-Learning/NaiveBayesClassifier/
Train.java
package org.zeropage.machinelearn;
import java.io.*;
import java.util.*;
class Trainer {
    private Map<String, Integer> sectionWord;
    private int sectionWordNum;
    private int sectionArticleNum;
    private File fileName;

    // Returns true for noise tokens that should not be counted:
    // single characters, URL fragments, portal names, and empty strings.
    private boolean isSkipData(String inputStr) {
        if (inputStr.length() == 1 || inputStr.equals("http") || inputStr.equals("blog") || inputStr.equals("com") ||
            inputStr.equals("naver") || inputStr.equals("empas") || inputStr.equals("daum") || inputStr.equals("yahoo") ||
            inputStr.equals("tistory") || inputStr.equals("co") || inputStr.equals("kr") || inputStr.equals("www") || inputStr.equals("ohmynews") ||
            inputStr.equals("//") || inputStr.equals("")) {
            return true;
        }
        else { return false; }
    }

    // Constructor: takes the training data file for one section.
    public Trainer(File f) {
        this.fileName = f;
    }

    // Trains on the data file: each line is one article, split on whitespace into words.
    public void TrainData() {
        this.sectionWordNum = 0;
        this.sectionArticleNum = 0;
        this.sectionWord = new HashMap<String, Integer>();
        try {
            Scanner sectionLearn = new Scanner(this.fileName);
            while (sectionLearn.hasNextLine()) {
                this.sectionArticleNum++;
                String[] a = sectionLearn.nextLine().split("\\s+");
                for (String wordTmp : a) {
                    if (isSkipData(wordTmp)) { continue; } // skip single characters, URL fragments, portal names
                    if (this.sectionWord.get(wordTmp) == null) { // first occurrence of this word
                        this.sectionWord.put(wordTmp, 1);
                    }
                    else { this.sectionWord.put(wordTmp, this.sectionWord.get(wordTmp) + 1); } // otherwise increment its count
                    this.sectionWordNum++; // total word count for the section
                }
            }
            sectionLearn.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Returns the learned word-count map.
    public HashMap<String, Integer> getSectionData() {
        return (HashMap<String, Integer>) this.sectionWord;
    }

    // Returns the total number of words counted in this section.
    public int getSectionWordsNumber() {
        return this.sectionWordNum;
    }

    // Returns the count of the given word plus one (add-one smoothing); returns 1 if the word was never seen.
    public int getSectionWordNumber(String word) {
        return (sectionWord.get(word) == null) ? 1 : sectionWord.get(word) + 1;
    }

    // Returns the number of articles (lines) read for this section.
    public int getSectionArticleNumber() {
        return this.sectionArticleNum;
    }
}
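A minimal sketch of how Trainer could be exercised on its own, just to see the counts it produces (the TrainerDemo class and the politics.db path are hypothetical and not part of the original code; the class sits in the same package because Trainer is package-private):

package org.zeropage.machinelearn;

import java.io.File;

public class TrainerDemo {
    public static void main(String[] args) {
        // Hypothetical training file: one article per line, words separated by whitespace.
        Trainer t = new Trainer(new File("politics.db"));
        t.TrainData();

        System.out.println("Articles read : " + t.getSectionArticleNumber());
        System.out.println("Words counted : " + t.getSectionWordsNumber());
        // getSectionWordNumber returns the stored count plus one (add-one smoothing),
        // so even a word that never appeared yields 1 rather than 0.
        System.out.println("Smoothed count of \"economy\": " + t.getSectionWordNumber("economy"));
    }
}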
Analyzer.java
package org.zeropage.machinelearn;
import java.util.*;
import java.io.*;
public class Analyzer {
    private Trainer[] sectionTrain;
    private int notInSectionArticleSum = 0;
    private int notInSectionWordSum = 0;
    private int notInSectionWordTotalSum = 0;

    // Sums the article count and the total word count of every section except the one
    // at the given index (the totals are reset before each calculation).
    private void CalculateNotInSection(int index) {
        this.notInSectionArticleSum = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i != index) { notInSectionArticleSum += sectionTrain[i].getSectionArticleNumber(); }
        }
        this.notInSectionWordTotalSum = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i != index) { notInSectionWordTotalSum += sectionTrain[i].getSectionWordsNumber(); }
        }
    }

    // Sums the smoothed count of the given word over every section except the one at the given index.
    private void CalculateNotInSectionWord(int index, String word) {
        this.notInSectionWordSum = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i != index) { notInSectionWordSum += sectionTrain[i].getSectionWordNumber(word); }
        }
    }

    // Returns the score of an article for the section at the given index:
    // a positive score means the article is judged to belong to that section,
    // a negative score means it is judged not to.
    private double getWeight(int index, String Article) {
        double reslt = getLnPsPns(index);
        for (String wordTmp : Article.split("\\s+")) {
            reslt += getLnPwsPwns(index, wordTmp);
        }
        return reslt;
    }

    // Computes Ln[ p(S) / p(!S) ] for the section at the given index, using article counts.
    private double getLnPsPns(int index) {
        return Math.log((double) sectionTrain[index].getSectionArticleNumber() / notInSectionArticleSum);
    }

    // Computes one term of Sigma Ln[ p(Wi|S) / p(Wi|!S) ] for the section at the given index.
    // The out-of-section count is adjusted by the Advantage (see ArticleAdvantage).
    private double getLnPwsPwns(int index, String word) {
        CalculateNotInSectionWord(index, word); // plain out-of-section sum; the Advantage-adjusted value below is what is actually used
        return Math.log(((double) sectionTrain[index].getSectionWordNumber(word) / sectionTrain[index].getSectionWordsNumber()) / ((double) ArticleAdvantage(index, word) / notInSectionWordTotalSum));
    }

    // Computes the Advantage-adjusted out-of-section count for a word.
    // If the word never appears in another section (smoothed count 1 there) while it appears
    // in the target section (smoothed count > 1), that section contributes
    // 1 - (count in the target section / articles in the target section * 50)
    // instead of its own smoothed count.
    private double ArticleAdvantage(int index, String word) {
        double advantageResult = 0;
        for (int i = 0; i < sectionTrain.length; i++) {
            if (i != index) {
                if (sectionTrain[i].getSectionWordNumber(word) == 1 && sectionTrain[index].getSectionWordNumber(word) > 1) {
                    advantageResult += (1 - ((double) sectionTrain[index].getSectionWordNumber(word) / sectionTrain[index].getSectionArticleNumber() * 50));
                }
                else { advantageResult += sectionTrain[i].getSectionWordNumber(word); }
            }
        }
        return advantageResult;
    }

    // Classifies every line of the given file against the section at the given index and prints
    // how many lines were judged to belong (Right) or not (Wrong), together with a score that
    // combines the prior log-odds and the fraction judged Right.
    public void DocumentResult(File f, int index) {
        int negaNum = 0;
        int posiNum = 0;
        CalculateNotInSection(index);
        try {
            Scanner targetDocument = new Scanner(f);
            while (targetDocument.hasNextLine()) {
                if (getWeight(index, targetDocument.nextLine()) < 0) { negaNum++; }
                else { posiNum++; }
            }
            targetDocument.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        System.out.println("Right : " + posiNum + " Wrong : " + negaNum + " Result : " + (getLnPsPns(index) + ((double) posiNum / (posiNum + negaNum))));
    }

    // Constructor: takes one training data file per section and trains each section.
    public Analyzer(File[] dataList) {
        this.sectionTrain = new Trainer[dataList.length];
        for (int i = 0; i < sectionTrain.length; i++) {
            sectionTrain[i] = new Trainer(dataList[i]);
            sectionTrain[i].TrainData();
        }
    }
}
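In formula form, the score that getWeight assigns to an article $D = w_1 \ldots w_n$ for the section $S$ at the given index (writing the remaining sections collectively as $\lnot S$) is the usual naive Bayes log-odds, estimated here from the trained counts:

$$
\mathrm{weight}(D, S) \;=\; \underbrace{\ln\frac{A_S}{A_{\lnot S}}}_{\approx\, \ln p(S)/p(\lnot S)} \;+\; \sum_{i=1}^{n} \underbrace{\ln\frac{c_S(w_i)\,/\,N_S}{\tilde{c}_{\lnot S}(w_i)\,/\,N_{\lnot S}}}_{\approx\, \ln p(w_i \mid S)\,/\,p(w_i \mid \lnot S)}
$$

Here $A_S, A_{\lnot S}$ are article counts (getSectionArticleNumber, CalculateNotInSection), $N_S, N_{\lnot S}$ are total word counts (getSectionWordsNumber), $c_S(w)$ is the smoothed in-section count (getSectionWordNumber), and $\tilde{c}_{\lnot S}(w)$ is the Advantage-adjusted out-of-section count (ArticleAdvantage). A line is assigned to $S$ when its weight is positive.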
Runner.java
package org.zeropage.machinelearn;
import java.io.File;
public class Runner {
    public static void main(String[] args) {
        File[] dbList =
        {
            new File("svm_data.tar/package/train/economy/index.economy.db"),
            new File("svm_data.tar/package/train/politics/index.politics.db"),
        };
        Analyzer anal = new Analyzer(dbList); // train each section from its DB file
        anal.DocumentResult(new File("svm_data.tar/package/test/economy/economy.txt"), 0);
        anal.DocumentResult(new File("svm_data.tar/package/test/politics/politics.txt"), 1);
    }
}
Result on Economy.txt after training: 0.83 (83%)
Result on Politics.txt after training: 0.94 (94%)
Average: 0.885 (88.5%)
Tokens that carry no information (http, //, blog, yahoo, empas, tistory, etc.) are skipped rather than counted during training.
Words that appear in only one section are given an Advantage: the more often such a word occurs in that section, the larger the Advantage. (It only applies to words unique to a single section; the exact adjustment is written out below.)
It only made a difference of about 0.5%..
^^;;
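In the notation of the score formula above, the Advantage in ArticleAdvantage replaces each other section's contribution to $\tilde{c}_{\lnot S}(w)$ as follows (recall that a smoothed count $c_T(w) = 1$ means the word never appears in section $T$):

$$
\tilde{c}_{\lnot S}(w) \;=\; \sum_{T \neq S}
\begin{cases}
1 - 50\,\dfrac{c_S(w)}{A_S} & \text{if } c_T(w) = 1 \text{ and } c_S(w) > 1,\\[6pt]
c_T(w) & \text{otherwise.}
\end{cases}
$$

So a word seen only in the target section shrinks the out-of-section count below 1, and the more often it occurs in the target section, the smaller that count gets and the larger the resulting weight.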










