데블스캠프2011/둘째날/Machine-Learning/Naive Bayes Classifier/김동준

Difference between r1.5 and the current

@@ -6,12 +6,13 @@

import java.io.*;
import java.util.*;

class ~~Train~~ {

private Map<String,Integer> ~~economyWord~~;

private ~~Map<String,Integer>~~ ~~politicWord~~;

private int ~~economyNum~~;

private ~~int~~ ~~politicNum~~;

~~private~~ ~~boolean~~ ~~isSkipData(String~~ ~~inputStr)~~ { // ~~자신의~~ ~~사이트,~~ ~~블로그,~~ ~~페이지~~ ~~주소의~~ 경우 ~~연관성이~~ ~~떨어지므로 검색에서~~ 제외

class Trainer {

private Map<String,Integer> sectionWord;

private int sectionWordNum;

private int sectionArticleNum;

private File fileName;

// 들어온 데이터의 신뢰성을 체크하는 함수. 신뢰성이 없는경우 False 반환

private boolean isSkipData(String inputStr) {

if(inputStr.length() == 1 || inputStr.equals("http") || inputStr.equals("blog") || inputStr.equals("com") ||
inputStr.equals("naver") || inputStr.equals("empas") || inputStr.equals("daum") || inputStr.equals("yahoo") ||
inputStr.equals("tistory") || inputStr.equals("co") || inputStr.equals("kr") || inputStr.equals("www") || inputStr.equals("ohmynews") ||

@@ -20,57 +21,49 @@

}
else { return false; }
}

// 생성자

public Trainer(File f) {

this.fileName = f;

}

// Data에 대한 학습 시행

public void TrainData() {

this.~~economyNum~~ = 0;

this.~~politicNum~~ = 0;

this.~~economyWord = new HashMap<String,Integer>();~~

~~this.politicWord~~ = new HashMap<String,Integer>();

this.sectionWordNum = 0;

this.sectionArticleNum = 0;

this.sectionWord = new HashMap<String,Integer>();

try {

Scanner ~~economyLearn~~ = new Scanner(~~new File("svm_data~~.~~tar/package/train/economy/index.economy.db")~~);

while(~~economyLearn~~.hasNextLine()) {

~~String[] a = economyLearn.nextLine().split(" ");~~

~~for(String wordTmp:a) {~~

~~if(isSkipData(wordTmp)) {continue;}~~

~~if( this.economyWord.get(wordTmp) == null) {~~

this.~~economyNum~~++;

~~this.economyWord.put(wordTmp, 1);~~

}

~~else { this.economyWord.put(wordTmp, this.economyWord.get(wordTmp)+1); }~~

}

~~economyLearn.close();~~

~~Scanner politicLearn = new Scanner(new File("svm_data.tar/package/train/politics/index.politics.db"));~~

~~while(politicLearn.hasNextLine()) {~~

String[] a = ~~politicLearn~~.nextLine().split(" ");

Scanner sectionLearn = new Scanner(this.fileName);

while(sectionLearn.hasNextLine()) {

this.sectionArticleNum++;

String[] a = sectionLearn.nextLine().split("\\s+");

for(String wordTmp:a) {

if(isSkipData(wordTmp)) {continue; }

if (this.~~politicWord~~.get(wordTmp) == null ) {

this.~~politicNum++;~~

~~this.politicWord~~.put(wordTmp, 1);

if(isSkipData(wordTmp)) {continue;} // 1글자Data, 사이트, 블로그, 페이지 주소의 경우 연관성및 신뢰성이 떨어지므로 검색에서 제외

if( this.sectionWord.get(wordTmp) == null) { // 해당 단어가 없으면

this.sectionWord.put(wordTmp, 1); // 1개로 새로 넣고 추가

}

else { this.~~politicWord~~.put(wordTmp, this.~~politicWord~~.get(wordTmp)+1); }

else { this.sectionWord.put(wordTmp, this.sectionWord.get(wordTmp)+1); } // 있는경우 자신의 것에 갯수를 한개 더 추가

this.sectionWordNum++; //Word 수는 중복과 상관없이 갯수를 Count 하므로 무조건 증가시킨다.

}
}

~~politicLearn~~.close();

sectionLearn.close();

} catch (FileNotFoundException e) {

~~// TODO Auto-generated catch block~~

~~e.printStackTrace();~~

~~} catch (IOException e) {~~

~~// TODO Auto-generated catch block~~

e.printStackTrace();
}
}

public HashMap<String,Integer> ~~getEconomyData~~() {

return (HashMap<String, Integer>) this.~~economyWord~~;

// 학습된 데이터를 반환

public HashMap<String,Integer> getSectionData() {

return (HashMap<String, Integer>) this.sectionWord;

}

public ~~HashMap<String,Integer>~~ ~~getPoliticData~~() {

return ~~(HashMap<String, Integer>)~~ this.~~politicWord~~;

// 총 단어수를 반환

public int getSectionWordsNumber() {

return this.sectionWordNum;

}

public int ~~getEconomyNumber~~() {

return ~~this~~.~~economyNum~~;

// 특정 단어의 갯수를 반환, 없는경우 1 반환

public int getSectionWordNumber(String word) {

return (sectionWord.get(word) == null) ? 1 : sectionWord.get(word)+1;

}

public int ~~getPoliticNumber~~() {

return this.~~politicNum~~;

// 전체 기사의 수를 반환

public int getSectionArticleNumber() {

return this.sectionArticleNum;

}
}
}}}

@@ -82,61 +75,108 @@

import java.io.*;

public class Analyzer {

private ~~static~~ ~~HashMap~~<~~String,Integer>~~ ~~ecoData~~;

private ~~static~~ ~~HashMap~~<String,~~Integer>~~ ~~polData~~;

private ~~static~~ ~~Train~~ ~~machineTrain~~;

private ~~static~~ double ~~DocumentResult~~(~~File~~ f, ~~boolean~~ ~~isEconomy~~) {

double ~~negaNum~~ = 0;

~~double~~ ~~posiNum~~ = 0;

double ~~ecoResultNum~~ = 0;

~~double~~ ~~polResultNum~~ = 0;

~~double~~ ~~reslt~~ = 0;

private Trainer[] sectionTrain;

private int notInSectionArticleSum = 0;

private int notInSectionWordSum = 0;

private int notInSectionWordTotalSum = 0;

//자기 Section 이 아닌 내용을 Calculate 하는 함수. Index 에 반응하며 수행시 초기화 후 계산한다.

private void CalculateNotInSection(int index) {

this.notInSectionArticleSum = 0;

for(int i = 0; i < sectionTrain.length; i++) {

if(i != index) { notInSectionArticleSum += sectionTrain[i].getSectionArticleNumber(); }

}

this.notInSectionWordTotalSum = 0;

for(int i = 0; i < sectionTrain.length; i++) {

if(i != index) { notInSectionWordTotalSum += sectionTrain[i].getSectionWordsNumber(); }

}

//해당 단어에 대한 자기 Section 이 아닌 단어수를 Calculate 하는 함수. Index 에 대응하며 수행시 초기화 후 계산한다.

private void CalculateNotInSectionWord(int index, String word) {

this.notInSectionWordSum = 0;

for(int i = 0; i < sectionTrain.length; i++) {

if(i != index) { notInSectionWordSum += sectionTrain[i].getSectionWordNumber(word); }

}

//해당 기사에 대한 연산 결과를 반환하는 함수. 양수일 경우 해당 Index Section 에 일치하는 기사로 판단한 것이며, 음수일 경우 해당 Index Section 에 일치하지 않는 기사라고 판단한 것이다.

private double getWeight(int index, String Article) {

double reslt = getLnPsPns(index);

for(String wordTmp:Article.split("\\s+")) {

reslt += getLnPwsPwns(index, wordTmp);

}

return reslt;

}

// Ln[p(S) / p(!S)] 값을 계산하는 함수. Index 에 대응한다.

private double getLnPsPns(int index) {

return Math.log((double)sectionTrain[index].getSectionArticleNumber() / notInSectionArticleSum);

}

// Sigma Ln[p(Wi ^ S) / p(Wi ^ !S)] 값을 계산하는 함수. Index 에 대응한다. 단 특정 단어에 대한 Advantage 를 부과한다. (Advantage 함수 참조)

private double getLnPwsPwns(int index, String word) {

CalculateNotInSectionWord(index, word);

return Math.log(((double)sectionTrain[index].getSectionWordNumber(word) / sectionTrain[index].getSectionWordsNumber()) / ((double)ArticleAdvantage(index, word) / notInSectionWordTotalSum));

}

// 특정 단어에 대한 Advantage 부과함수. 해당 Index Section 에만 존재하는 단어일때 빈도에 따른 가산점을 부여한다. 가산은 해당단어수 / Section 전체기사수 * 50 이다.

private double ArticleAdvantage(int index, String word) {

double advantageResult = 0;

for(int i = 0; i < sectionTrain.length; i++) {

if(i != index) {

if(sectionTrain[i].getSectionWordNumber(word) == 1 && sectionTrain[index].getSectionWordNumber(word) > 1) {

advantageResult += (1 - ((double)sectionTrain[index].getSectionWordNumber(word) / sectionTrain[index].getSectionArticleNumber() * 50));

}

else { advantageResult += sectionTrain[i].getSectionWordNumber(word); }

}

return advantageResult;

}

// 해당 File 변수에 대한 Index Section 과의 매치율을 보여주는 함수. 맞은 것과 틀린것, 그리고 그 것에 대한 판단 확률을 반환한다.

public void DocumentResult(File f, int index) {

int negaNum = 0;

int posiNum = 0;

CalculateNotInSection(index);

try {
Scanner targetDocument = new Scanner(f);
while(targetDocument.hasNextLine()) {

~~String[] a = targetDocument.nextLine().split(" ");~~

~~for(String wordTmp:a) {~~

if(~~ecoData.get~~(~~wordTmp)~~ ~~== null) { ecoResultNum = 0; }~~

~~else { ecoResultNum = ecoData~~.~~get~~(~~wordTmp~~)~~; }~~

~~if(polData.get(wordTmp~~) ~~== null) { polResultNum = 0; }~~

~~else { polResultNum = polData.get(wordTmp); }~~

~~ecoResultNum += 1;~~

~~polResultNum += 1;~~

~~if(isEconomy && polData.get(wordTmp) == null) { polResultNum -= 0.5; } // 경제파트이면서 정치쪽에 없는 단어에 Advantage 부과~~

~~if(!isEconomy && ecoData.get(wordTmp) == null) { ecoResultNum -= 0.5; } // 정치파트이면서 경제쪽에 없는 단어에 Adventage 부과~~

~~if(isEconomy) { reslt += Math.log(ecoResultNum / polResultNum); }~~

~~else { reslt += Math.log(polResultNum / ecoResultNum); }~~

}

~~if(reslt~~ < 0) { negaNum+~~= 1~~; }

else { posiNum +~~= 1~~; }

~~reslt = 0;~~

if(getWeight(index, targetDocument.nextLine()) < 0) { negaNum++; }

else { posiNum++; }

}
targetDocument.close();
} catch (FileNotFoundException e) {

~~// TODO Auto-generated catch block~~

e.printStackTrace();
}

~~return~~ posiNum / (posiNum+negaNum);

System.out.println("Right : " + posiNum + " Wrong : " + negaNum + " Result : " + (getLnPsPns(index) + ((double)posiNum / (posiNum+negaNum))));

}

~~public~~ ~~static~~ ~~void~~ ~~Init~~() {

~~machineTrain~~ = new ~~Train();~~

~~machineTrain~~.~~TrainData()~~;

~~ecoData~~ = ~~machineTrain~~.~~getEconomyData()~~;

~~polData~~ = ~~machineTrain~~.~~getPoliticData~~();

// 생성자. File 갯수에 따라 지정을 해준다.

public Analyzer(File[] dataList) {

this.sectionTrain = new Trainer[dataList.length];

for(int i = 0; i < sectionTrain.length; i++) {

sectionTrain[i] = new Trainer(dataList[i]);

sectionTrain[i].TrainData();

}

}}}

Runner.java

{{{

package org.zeropage.machinelearn;

import java.io.File;

public class Runner {

public static void main(String[] args) {

~~Init();~~

~~double~~ ~~result1~~ = ~~DocumentResult(~~new File("svm_data.tar/package/~~test~~/economy/economy.~~txt~~"), ~~true);~~

~~System~~.~~out~~.~~println(result1~~);

~~double~~ ~~result2~~ = DocumentResult(new File("svm_data.tar/package/test/~~politics~~/~~politics~~.txt"), ~~false~~);

~~System~~.~~out.println~~(~~result2);~~

~~System~~.~~out~~.~~println((result1 + result2~~) ~~/ 2~~);

File[] dbList =

{

new File("svm_data.tar/package/train/economy/index.economy.db"),

new File("svm_data.tar/package/train/politics/index.politics.db"),

};

Analyzer anal = new Analyzer(dbList); // Section 이 두개이니 두개로 저장.

anal.DocumentResult(new File("svm_data.tar/package/test/economy/economy.txt"), 0);

anal.DocumentResult(new File("svm_data.tar/package/test/politics/politics.txt"), 1);

}
}
}}}

Train 의 Economy.txt 파일 적중도 : 0.~~995~~ (~~99.5~~%)

Train 의 Politics.txt 파일 적중도 : 0.96 (96%)

전체 평균 적중도 : 0.~~9775~~ (97.75%)

위의 주석처럼 ~~약간의 Advantage 와~~ 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다.

Train 의 Economy.txt 파일 적중도 : 0.83 (83%)

Train 의 Politics.txt 파일 적중도 : 0.94 (94%)

전체 평균 적중도 : 0.885 (88.5%)

위의 주석처럼 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다.

각 단어중 특별한 단어에 Advantage 를 부과했는데, 단어가 해당 Section 에 Unique 하고 그 빈도가 클수록 Advantage를 크게 부과했습니다. (하지만 이도 분석한 Section에 상대적입니다.)

[그래서 그런지 결과엔 0.5% 차이밖에 없더군요..]

이 결과를 볼 수 있었으면 좋겠네요 ^^;;

Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here
Train.java

package org.zeropage.machinelearn;

import java.io.*;
import java.util.*;

/**
 * Learns word frequencies for a single section (category) from one training file.
 *
 * The file is read line by line; every line is counted as one article and is
 * split on whitespace into words. Call {@link #TrainData()} before querying the
 * getters; until then all counts are zero and the word table is empty.
 */
class Trainer {
	// Tokens that are ignored while training: URL fragments and portal/blog names.
	private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
			"http", "blog", "com", "naver", "empas", "daum", "yahoo",
			"tistory", "co", "kr", "www", "ohmynews", "//", "블로그"));

	// word -> number of occurrences seen in the training file
	private Map<String,Integer> sectionWord;
	// total number of counted words, duplicates included
	private int sectionWordNum;
	// number of lines (articles) read from the training file
	private int sectionArticleNum;
	private File fileName;

	/**
	 * Decides whether a token must be left out of the statistics.
	 *
	 * @param inputStr a single token produced by splitting a line on whitespace
	 * @return {@code true} when the token should be skipped: it is empty, it is
	 *         one character long, or it is one of {@link #SKIP_WORDS}
	 */
	private boolean isSkipData(String inputStr) {
		// "<= 1" also rejects the empty token that split("\\s+") produces for
		// blank lines and for lines that start with whitespace.
		return inputStr.length() <= 1 || SKIP_WORDS.contains(inputStr);
	}

	/**
	 * Creates a trainer bound to a training file. Nothing is read yet.
	 *
	 * @param f training file, one article per line
	 */
	public Trainer(File f) {
		this.fileName = f;
		// Start with an empty table so the getters are usable before TrainData().
		this.sectionWord = new HashMap<String,Integer>();
	}

	/**
	 * Reads the training file and rebuilds all statistics from scratch.
	 *
	 * If the file cannot be opened the stack trace is printed and the
	 * statistics stay at their reset (zero / empty) values.
	 */
	public void TrainData() {
		this.sectionWordNum = 0;
		this.sectionArticleNum = 0;
		this.sectionWord = new HashMap<String,Integer>();
		Scanner sectionLearn = null;
		try {
			sectionLearn = new Scanner(this.fileName);
			while (sectionLearn.hasNextLine()) {
				// Every line counts as an article, blank lines included.
				this.sectionArticleNum++;
				for (String wordTmp : sectionLearn.nextLine().split("\\s+")) {
					// Empty and one-character tokens, and site/blog/URL fragments,
					// carry little signal and are excluded.
					if (isSkipData(wordTmp)) { continue; }
					Integer seen = this.sectionWord.get(wordTmp);
					this.sectionWord.put(wordTmp, seen == null ? 1 : seen + 1);
					// The total counts every occurrence, not only distinct words.
					this.sectionWordNum++;
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			// Release the file handle even if reading fails part-way.
			if (sectionLearn != null) { sectionLearn.close(); }
		}
	}

	/**
	 * @return the learned word table (word -> occurrence count); this is the
	 *         live internal map, not a copy
	 */
	public HashMap<String,Integer> getSectionData() {
		return (HashMap<String, Integer>) this.sectionWord;
	}

	/**
	 * @return total number of counted words, duplicates included
	 */
	public int getSectionWordsNumber() {
		return this.sectionWordNum;
	}

	/**
	 * Returns the occurrence count of a word increased by one, so that a word
	 * never seen in this section yields 1 instead of 0.
	 *
	 * @param word word to look up
	 * @return stored count + 1, or 1 when the word is unknown
	 */
	public int getSectionWordNumber(String word) {
		Integer seen = sectionWord.get(word);
		return (seen == null) ? 1 : seen + 1;
	}

	/**
	 * @return number of articles (lines) read by the last {@link #TrainData()} call
	 */
	public int getSectionArticleNumber() {
		return this.sectionArticleNum;
	}
}

Analyzer.java

package org.zeropage.machinelearn;

import java.util.*;
import java.io.*;

/**
 * Naive-Bayes style scorer that decides whether articles belong to one of
 * several trained sections.
 *
 * One {@link Trainer} is built and trained per data file; a section is then
 * referred to by its index in that list. For a given index the score of an
 * article is ln[p(S)/p(!S)] plus, for every word, ln[p(w|S)/p(w|!S)], where
 * "!S" stands for all other sections together.
 */
public class Analyzer {
	// Lower bound for an "advantaged" count so that the value handed to
	// Math.log stays strictly positive. The exact magnitude is a tuning choice.
	private static final double MIN_ADVANTAGE_COUNT = 1e-6;

	private Trainer[] sectionTrain;
	// Number of articles in all sections except the one being evaluated.
	private int notInSectionArticleSum = 0;
	// Number of words in all sections except the one being evaluated.
	private int notInSectionWordTotalSum = 0;

	/**
	 * Recomputes the article and word totals of every section other than
	 * {@code index}. Must run before {@link #getLnPsPns(int)} or
	 * {@link #getLnPwsPwns(int, String)} are used for that index.
	 */
	private void CalculateNotInSection(int index) {
		this.notInSectionArticleSum = 0;
		this.notInSectionWordTotalSum = 0;
		for (int i = 0; i < sectionTrain.length; i++) {
			if (i == index) { continue; }
			notInSectionArticleSum += sectionTrain[i].getSectionArticleNumber();
			notInSectionWordTotalSum += sectionTrain[i].getSectionWordsNumber();
		}
	}

	/**
	 * Scores one article against section {@code index}.
	 *
	 * @param index   section the article is tested against
	 * @param Article article text; split on whitespace into words
	 * @return a value >= 0 when the article is judged to belong to the section,
	 *         a negative value when it is judged not to belong
	 */
	private double getWeight(int index, String Article) {
		double reslt = getLnPsPns(index);
		for (String wordTmp : Article.split("\\s+")) {
			reslt += getLnPwsPwns(index, wordTmp);
		}
		return reslt;
	}

	/**
	 * Computes ln[p(S) / p(!S)] from the article counts: articles of section
	 * {@code index} divided by the articles of all other sections.
	 */
	private double getLnPsPns(int index) {
		return Math.log((double)sectionTrain[index].getSectionArticleNumber() / notInSectionArticleSum);
	}

	/**
	 * Computes ln[p(w|S) / p(w|!S)] for one word. The count used for "!S" comes
	 * from {@link #ArticleAdvantage(int, String)}, which may lower it for words
	 * that occur only in section {@code index}.
	 */
	private double getLnPwsPwns(int index, String word) {
		double inSection = (double)sectionTrain[index].getSectionWordNumber(word) / sectionTrain[index].getSectionWordsNumber();
		double notInSection = ArticleAdvantage(index, word) / notInSectionWordTotalSum;
		return Math.log(inSection / notInSection);
	}

	/**
	 * Returns the word count attributed to the other sections, with a bonus for
	 * words that are unique to section {@code index}.
	 *
	 * For every other section that does not know the word (its smoothed count
	 * is 1) while section {@code index} does, the contribution 1 is replaced by
	 * 1 - (count in index section / articles in index section * 50). A smaller
	 * value here makes the word count more strongly in favour of the section.
	 * The replacement is clamped to {@link #MIN_ADVANTAGE_COUNT}: without the
	 * clamp it turns zero or negative for frequent words and Math.log then
	 * yields Infinity or NaN.
	 */
	private double ArticleAdvantage(int index, String word) {
		// Loop-invariant: the smoothed count of the word in the tested section.
		int ownCount = sectionTrain[index].getSectionWordNumber(word);
		double advantageResult = 0;
		for (int i = 0; i < sectionTrain.length; i++) {
			if (i == index) { continue; }
			int otherCount = sectionTrain[i].getSectionWordNumber(word);
			if (otherCount == 1 && ownCount > 1) {
				double discounted = 1 - ((double)ownCount / sectionTrain[index].getSectionArticleNumber() * 50);
				advantageResult += Math.max(discounted, MIN_ADVANTAGE_COUNT);
			}
			else { advantageResult += otherCount; }
		}
		return advantageResult;
	}

	/**
	 * Classifies every line of {@code f} against section {@code index} and
	 * prints how many lines were accepted ("Right"), how many were rejected
	 * ("Wrong") and the accepted fraction ("Result").
	 *
	 * If the file cannot be opened the stack trace is printed and the summary
	 * line is still written with zero counts.
	 *
	 * @param f     file with one article per line
	 * @param index section the articles are expected to belong to
	 */
	public void DocumentResult(File f, int index) {
		int negaNum = 0;
		int posiNum = 0;
		CalculateNotInSection(index);
		Scanner targetDocument = null;
		try {
			targetDocument = new Scanner(f);
			while (targetDocument.hasNextLine()) {
				if (getWeight(index, targetDocument.nextLine()) < 0) { negaNum++; }
				else { posiNum++; }
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			if (targetDocument != null) { targetDocument.close(); }
		}
		// The result is the plain hit rate; the log prior belongs to the
		// per-article score only and must not be added to a ratio.
		double hitRate = (double)posiNum / (posiNum + negaNum);
		System.out.println("Right : " + posiNum + " Wrong : " + negaNum + " Result : " + hitRate);
	}

	/**
	 * Builds and trains one {@link Trainer} per data file. The position of a
	 * file in {@code dataList} becomes the index of its section.
	 *
	 * @param dataList training files, one per section
	 */
	public Analyzer(File[] dataList) {
		this.sectionTrain = new Trainer[dataList.length];
		for (int i = 0; i < sectionTrain.length; i++) {
			sectionTrain[i] = new Trainer(dataList[i]);
			sectionTrain[i].TrainData();
		}
	}
}

Runner.java

package org.zeropage.machinelearn;

import java.io.File;

/**
 * Command-line entry point: trains on the economy and politics data files and
 * prints the classification summary for the matching test files.
 */
public class Runner {
	public static void main(String[] args) {
		// Training data; the position in the array is the section index.
		File economyDb = new File("svm_data.tar/package/train/economy/index.economy.db");
		File politicsDb = new File("svm_data.tar/package/train/politics/index.politics.db");
		Analyzer anal = new Analyzer(new File[] { economyDb, politicsDb });

		// Evaluate each test file against the section it should belong to.
		File economyTest = new File("svm_data.tar/package/test/economy/economy.txt");
		File politicsTest = new File("svm_data.tar/package/test/politics/politics.txt");
		anal.DocumentResult(economyTest, 0);
		anal.DocumentResult(politicsTest, 1);
	}
}

Test 의 Economy.txt 파일 적중도 : 0.83 (83%)
Test 의 Politics.txt 파일 적중도 : 0.94 (94%)
전체 평균 적중도 : 0.885 (88.5%)
위의 주석처럼 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다.
각 단어중 특별한 단어에 Advantage 를 부과했는데, 단어가 해당 Section 에 Unique 하고 그 빈도가 클수록 Advantage를 크게 부과했습니다. (하지만 이도 분석한 Section에 상대적입니다.)
그래서 그런지 결과엔 0.5% 차이밖에 없더군요..
이 결과를 볼 수 있었으면 좋겠네요 ^^;;