데블스캠프2011/둘째날/Machine-Learning/Naive Bayes Classifier/김동준

Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here
Train.java

package org.zeropage.machinelearn;

import java.io.*;
import java.util.*;

/**
 * Collects word statistics for one section (category) from a training file.
 *
 * <p>The file is read line by line. Every line is counted as one article and is split on
 * whitespace into words. Words that pass the {@link #isSkipData(String)} filter are tallied
 * per word and in a running total. Until {@link #TrainData()} has been called (or if the
 * file cannot be opened) all statistics are empty / zero.
 */
class Trainer {
	/** Tokens that are ignored during training regardless of how often they occur. */
	private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
			"http", "blog", "com", "naver", "empas", "daum", "yahoo", "tistory",
			"co", "kr", "www", "ohmynews", "//", "블로그"));

	/** Occurrence count per accepted word. Never null, so the getters work before training. */
	private HashMap<String,Integer> sectionWord = new HashMap<String,Integer>();
	/** Total number of accepted words, duplicates included. */
	private int sectionWordNum;
	/** Number of lines (articles) read from the training file. */
	private int sectionArticleNum;
	/** The training file for this section. */
	private File fileName;

	/**
	 * Decides whether a token should be left out of the statistics.
	 *
	 * @param inputStr a single whitespace-delimited token
	 * @return {@code true} if the token is empty, a single character, or one of the
	 *         {@link #SKIP_WORDS}; {@code false} if it should be counted
	 */
	private boolean isSkipData(String inputStr) {
		// length() <= 1 also rejects the empty token that split("\\s+") produces for
		// blank lines and for lines that start with whitespace.
		return inputStr.length() <= 1 || SKIP_WORDS.contains(inputStr);
	}

	/**
	 * Creates a trainer for the given file. Nothing is read until {@link #TrainData()} is called.
	 *
	 * @param f the training file; one article per line
	 */
	public Trainer(File f) {
		this.fileName = f;
	}

	/**
	 * Reads the training file and rebuilds all statistics from scratch.
	 *
	 * <p>If the file cannot be opened the stack trace is printed and the statistics stay empty.
	 */
	public void TrainData() {
		this.sectionWordNum = 0;
		this.sectionArticleNum = 0;
		this.sectionWord = new HashMap<String,Integer>();

		Scanner sectionLearn;
		try {
			sectionLearn = new Scanner(this.fileName);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
			return;
		}

		try {
			while (sectionLearn.hasNextLine()) {
				this.sectionArticleNum++;	// every line counts as one article
				for (String wordTmp : sectionLearn.nextLine().split("\\s+")) {
					if (isSkipData(wordTmp)) {
						continue;
					}
					Integer seen = this.sectionWord.get(wordTmp);
					this.sectionWord.put(wordTmp, seen == null ? 1 : seen + 1);
					this.sectionWordNum++;	// total counts every occurrence, duplicates included
				}
			}
		} finally {
			// Close the file even if scanning fails part-way through.
			sectionLearn.close();
		}
	}

	/**
	 * Returns the live map of word to occurrence count built by the last {@link #TrainData()} call.
	 *
	 * @return the word-count map (empty if no training has happened)
	 */
	public HashMap<String,Integer> getSectionData() {
		return this.sectionWord;
	}

	/**
	 * Returns the total number of counted words, duplicates included.
	 *
	 * @return total word occurrences in this section
	 */
	public int getSectionWordsNumber() {
		return this.sectionWordNum;
	}

	/**
	 * Returns the occurrence count of a word plus one, so that an unseen word yields 1
	 * rather than 0.
	 *
	 * @param word the word to look up
	 * @return recorded count + 1, or 1 if the word was never counted
	 */
	public int getSectionWordNumber(String word) {
		Integer seen = sectionWord.get(word);
		return (seen == null) ? 1 : seen + 1;
	}

	/**
	 * Returns the number of articles (lines) read from the training file.
	 *
	 * @return article count of this section
	 */
	public int getSectionArticleNumber() {
		return this.sectionArticleNum;
	}
}

Analyzer.java

package org.zeropage.machinelearn;

import java.util.*;
import java.io.*;

/**
 * Scores articles against one section using the per-section statistics of several
 * {@link Trainer}s.
 *
 * <p>For a section {@code index} the weight of an article is
 * {@code ln(articles(S) / articles(!S))} plus, for every whitespace-delimited word,
 * {@code ln((count(w, S) / words(S)) / (advantage(w) / words(!S)))}, where "!S" means all
 * other sections taken together. An article whose weight is not below zero is counted as
 * belonging to the section.
 */
public class Analyzer {
	/** One trained section per input file, in the order the files were given. */
	private final Trainer[] sectionTrain;
	/** Total article count of all sections except the one being evaluated. */
	private int notInSectionArticleSum = 0;
	/** Total word count of all sections except the one being evaluated. */
	private int notInSectionWordTotalSum = 0;

	/**
	 * Recomputes the article and word totals of every section other than {@code index}.
	 * Must run before any weight for {@code index} is computed.
	 *
	 * @param index the section being evaluated, i.e. the one left out of the totals
	 */
	private void calculateNotInSection(int index) {
		this.notInSectionArticleSum = 0;
		this.notInSectionWordTotalSum = 0;
		for (int i = 0; i < sectionTrain.length; i++) {
			if (i == index) {
				continue;
			}
			notInSectionArticleSum += sectionTrain[i].getSectionArticleNumber();
			notInSectionWordTotalSum += sectionTrain[i].getSectionWordsNumber();
		}
	}

	/**
	 * Computes the weight of one article for a section. A negative weight means the article
	 * is judged not to belong to the section.
	 *
	 * @param index   the section to score against
	 * @param Article the article text; split on whitespace into words
	 * @return the prior term plus the sum of the per-word terms
	 */
	private double getWeight(int index, String Article) {
		double reslt = getLnPsPns(index);
		for (String wordTmp : Article.split("\\s+")) {
			reslt += getLnPwsPwns(index, wordTmp);
		}
		return reslt;
	}

	/**
	 * Returns {@code ln(p(S) / p(!S))}, computed as the section's article count divided by
	 * the article count of all other sections.
	 *
	 * @param index the section being evaluated
	 * @return the log prior ratio
	 */
	private double getLnPsPns(int index) {
		return Math.log((double) sectionTrain[index].getSectionArticleNumber() / notInSectionArticleSum);
	}

	/**
	 * Returns the per-word term {@code ln(p(w|S) / p(w|!S))}, where the numerator of
	 * {@code p(w|!S)} comes from {@link #articleAdvantage(int, String)}.
	 *
	 * <p>NOTE(review): {@code articleAdvantage} can return zero or a negative value, in which
	 * case this term is infinite or NaN. A NaN weight is not {@code < 0}, so
	 * {@link #DocumentResult(File, int)} counts such an article as a positive — confirm this
	 * is intended.
	 *
	 * @param index the section being evaluated
	 * @param word  a single word of the article
	 * @return the log likelihood ratio for the word
	 */
	private double getLnPwsPwns(int index, String word) {
		double inSection = (double) sectionTrain[index].getSectionWordNumber(word)
				/ sectionTrain[index].getSectionWordsNumber();
		double notInSection = (double) articleAdvantage(index, word) / notInSectionWordTotalSum;
		return Math.log(inSection / notInSection);
	}

	/**
	 * Computes the "not in section" count of a word, with an advantage for words that only
	 * the evaluated section has seen.
	 *
	 * <p>For every other section: if that section never saw the word (its count-plus-one is 1)
	 * while the evaluated section did, {@code 1 - count(w, S) / articles(S) * 50} is added;
	 * otherwise that section's count-plus-one is added.
	 *
	 * @param index the section being evaluated
	 * @param word  the word to look up
	 * @return the adjusted count used as the numerator of {@code p(w|!S)}
	 */
	private double articleAdvantage(int index, String word) {
		// The evaluated section's own count does not change inside the loop.
		int ownCount = sectionTrain[index].getSectionWordNumber(word);
		double advantageResult = 0;
		for (int i = 0; i < sectionTrain.length; i++) {
			if (i == index) {
				continue;
			}
			int otherCount = sectionTrain[i].getSectionWordNumber(word);
			if (otherCount == 1 && ownCount > 1) {
				advantageResult += (1 - ((double) ownCount / sectionTrain[index].getSectionArticleNumber() * 50));
			} else {
				advantageResult += otherCount;
			}
		}
		return advantageResult;
	}

	/**
	 * Scores every line of a file against a section and prints how many lines were accepted
	 * ("Right") and rejected ("Wrong").
	 *
	 * <p>If the file cannot be opened the stack trace is printed and the summary line is still
	 * written with zero counts.
	 *
	 * <p>NOTE(review): the printed "Result" is the log prior ratio plus the accepted fraction;
	 * it equals the plain accepted fraction only when the prior ratio is 1 — confirm intent.
	 *
	 * @param f     the file to score; one article per line
	 * @param index the section the articles are expected to belong to
	 */
	public void DocumentResult(File f, int index) {
		int negaNum = 0;
		int posiNum = 0;
		calculateNotInSection(index);
		try {
			Scanner targetDocument = new Scanner(f);
			try {
				while (targetDocument.hasNextLine()) {
					if (getWeight(index, targetDocument.nextLine()) < 0) {
						negaNum++;
					} else {
						posiNum++;
					}
				}
			} finally {
				// Close the file even if scoring fails part-way through.
				targetDocument.close();
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
		System.out.println("Right : " + posiNum + " Wrong : " + negaNum + " Result : " + (getLnPsPns(index) + ((double)posiNum / (posiNum+negaNum))));
	}

	/**
	 * Creates and trains one {@link Trainer} per training file.
	 *
	 * @param dataList the training files; the array position becomes the section index
	 */
	public Analyzer(File[] dataList) {
		this.sectionTrain = new Trainer[dataList.length];
		for (int i = 0; i < sectionTrain.length; i++) {
			sectionTrain[i] = new Trainer(dataList[i]);
			sectionTrain[i].TrainData();
		}
	}
}

Runner.java

package org.zeropage.machinelearn;

import java.io.File;

/**
 * Command-line entry point: trains on the economy and politics data sets and then scores
 * the matching test document of each section.
 */
public class Runner {
	/**
	 * Trains one section per training file and prints the result for each test document.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		File economyTrain = new File("svm_data.tar/package/train/economy/index.economy.db");
		File politicsTrain = new File("svm_data.tar/package/train/politics/index.politics.db");

		// Two sections, so two training files; the array position is the section index.
		Analyzer analyzer = new Analyzer(new File[] { economyTrain, politicsTrain });

		// Test documents in the same section order as the training files.
		File[] testDocuments = {
			new File("svm_data.tar/package/test/economy/economy.txt"),
			new File("svm_data.tar/package/test/politics/politics.txt"),
		};
		for (int section = 0; section < testDocuments.length; section++) {
			analyzer.DocumentResult(testDocuments[section], section);
		}
	}
}

Train 후 Economy.txt 결과 : 0.83 (83%)
Train 후 Politics.txt 결과 : 0.94 (94%)
평균 결과 : 0.885 (88.5%)
의미 없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 빼고 계산하게 했다.
각 단어 단위로 Advantage 를 부과하는데, 단어가 해당 Section 에 Unique 하고 그 수가 많을수록 Advantage를 크게 부과한다. (하지만 두 Section에 대해서다.)
그래서 그런지 결과는 0.5% 정도밖에 안 오르더군요..
다른 결과를 볼 수 있으면 좋겠네요 ^^;;