Runner.java ¶
/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package us.linfl.ml; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author Linflus */ public class Runner { /** * @param args the command line arguments */ public static void main(String[] args) { BufferedReader reader = null; String path = "C:\\Users\\Linflus\\CAU\\ZeroPage\\2011\\Devils Camp\\trunk\\DocumentClassification\\"; try { // TODO code application logic here WordsTable words = new WordsTable(); words.setFile(path + "index.economy.db"); words.readFile(WordsTable.Type.ECONOMY); words.setFile(path + "index.politics.db"); words.readFile(WordsTable.Type.POLITICS); reader = new BufferedReader(new FileReader(path + "politics.txt")); NaiveBayes nb; String line; while((line = reader.readLine()) != null){ nb = new NaiveBayes(reader.readLine(), words); System.out.println(nb.classify()); } System.out.println((float)NaiveBayes.p/(NaiveBayes.e+NaiveBayes.p)); } catch (IOException ex) { Logger.getLogger(Runner.class.getName()).log(Level.SEVERE, null, ex); } } }
NaiveBayes.java ¶
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package us.linfl.ml;

/**
 * Two-class (ECONOMY vs. POLITICS) log-odds classifier for one document,
 * driven by the word and document counts held in a {@link WordsTable}.
 *
 * <p>Only words that occur in both the economy and the politics table
 * contribute to the score; words seen in just one class are ignored.
 *
 * @author Linflus
 */
public class NaiveBayes {

    /** Number of {@link #classify()} calls, across all instances, that returned ECONOMY. */
    public static int e = 0;
    /** Number of {@link #classify()} calls, across all instances, that returned POLITICS. */
    public static int p = 0;

    /** Source of the per-class word counts and document counts. */
    WordsTable table;
    /** Whitespace-separated tokens of the document being classified. */
    String[] words;

    /**
     * @param document text to classify; it is split on runs of whitespace.
     *                 {@code null} is treated as an empty document.
     * @param table    trained counts to score the document against
     */
    public NaiveBayes(String document, WordsTable table) {
        // A reader at end of file hands back null; treat that as "no words"
        // instead of failing inside split().
        words = (document == null) ? new String[0] : document.split("\\s+");
        this.table = table;
    }

    /**
     * Computes the prior log-odds of ECONOMY over POLITICS from the number
     * of training documents of each class.
     *
     * @return {@code ln(ecoN / poliN)}, or {@code 0} when either class has
     *         no training documents (including when the table is empty)
     */
    public double calcDocProb() {
        if (table.ecoN == 0 || table.poliN == 0) {
            // No usable prior: stay neutral rather than produce an infinite or NaN value.
            return 0;
        }
        return Math.log((double) table.ecoN / table.poliN);
    }

    /**
     * Classifies the document and increments the matching static counter
     * ({@link #e} or {@link #p}).
     *
     * @return ECONOMY when the prior log-odds plus the summed per-word
     *         log-ratios is strictly positive, otherwise POLITICS
     */
    public WordsTable.Type classify() {
        double pw = 0;
        for (String word : words) {
            if (word.isEmpty()) {
                // split() yields "" for leading whitespace; that is not a word.
                continue;
            }
            Integer eco = table.ecoWords.get(word);
            Integer poli = table.poliWords.get(word);
            if (eco != null && poli != null && poli != 0) {
                pw += Math.log((double) eco / poli);
            }
        }

        if (calcDocProb() + pw > 0) {
            e++;
            return WordsTable.Type.ECONOMY;
        }
        p++;
        return WordsTable.Type.POLITICS;
    }
}
WordsTable.java ¶
/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package us.linfl.ml; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author Linflus */ public class WordsTable { String filename; Type filetype; Map<String, Integer> ecoWords = new HashMap(); Map<String, Integer> poliWords = new HashMap(); int ecoN = 0, poliN = 0; public void setFile(String filename) { this.filename = filename; } public int[] getCount(String word) { int[] count = {}; count[0] = ecoWords.get(word); count[1] = poliWords.get(word); return count; } public void readFile(Type filetype) { try { String line; BufferedReader reader = new BufferedReader(new FileReader(filename)); while((line = reader.readLine()) != null){ parse(line, filetype); } } catch (IOException ex) { Logger.getLogger(WordsTable.class.getName()).log(Level.SEVERE, null, ex); } } private void parse(String document, Type filetype) { String[] words = document.split("\\s+"); Map<String, Integer> typedWords = null; if(filetype == Type.ECONOMY){ typedWords = ecoWords; ecoN++; }else if(filetype == Type.POLITICS){ typedWords = poliWords; poliN++; } String key = null; for(int i=0; i<words.length; i++) { key = words[i]; if(typedWords.containsKey(key)){ typedWords.put(key, typedWords.get(key) + 1); }else{ typedWords.put(key, 1); } } } public enum Type { ECONOMY, POLITICS } }