From d716be16bb2be245d3f917482cfbd89ec8d21f76 Mon Sep 17 00:00:00 2001
From: Xiaodong Yu
Date: Mon, 1 Oct 2018 16:55:18 -0500
Subject: [PATCH 1/2] fix parameter bug

---
Review notes (patch commentary only; not part of the commit message or the diff):
  * The four-argument getLearningCurve overload forwards to a five-argument
    call. This change moves the literal "-c" from the third argument position
    to the second, so trainDataPath is now passed third.
  * The parameter list of the five-argument overload is not visible in this
    patch. Presumably its second parameter is the data-format flag and its
    third the training path; confirm against LearningCurveMultiDataset.
  * Layout: this copy of the series had lost its line breaks and indentation.
    They are restored here from the hunk headers; where the method signature
    wraps and where blank context lines sit are inferred, not known.

 .../cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
index b8dee1ccc..3a00f5b71 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
@@ -73,7 +73,7 @@ public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
      */
     public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
             String testDataPath, boolean incremental) throws Exception {
-        getLearningCurve(fixedNumIterations, trainDataPath, "-c", testDataPath, incremental);
+        getLearningCurve(fixedNumIterations, "-c", trainDataPath, testDataPath, incremental);
     }
 
     /**
From ea7e1247a98877a8edae86ee477c520be35bf06c Mon Sep 17 00:00:00 2001
From: Xiaodong Yu
Date: Sun, 28 Oct 2018 20:11:21 -0500
Subject: [PATCH 2/2] change CLM class

---
Review notes (patch commentary only; not part of the commit message or the diff):
  * Adds a static THashMap field `charlms` and a static accessor getLM(key)
    that returns charlms.get(key). No line in this patch puts anything into
    `charlms`; unless code outside these hunks populates it, getLM returns
    null for every key. Confirm before relying on it.
  * Adds test(eclm, neclm, testData). For every word it reduces the gold label
    to "O" or "B-ENT" and predicts "O" when the word form is shorter than
    3 characters, otherwise "B-ENT" when the entity model's perplexity is
    lower than the non-entity model's, otherwise "O". It prints each wrongly
    predicted form, prints correct/total as the accuracy, and writes
    "form label pred" lines to pred.txt with an empty line after each
    sentence.
    NOTE(review): correct and total are doubles, so test data with no words
    prints NaN as the accuracy. pred.txt is always written to the working
    directory.
  * Adds readList(path). It reads the file line by line through an InFile
    wrapped around a FileInputStream and splits each line on a single space.
    NOTE(review): no close() call appears for either the FileInputStream or
    the InFile. A FileNotFoundException is handled with printStackTrace()
    only, after which the method returns an empty list.
    Whether InFile.readLine() can return text that still contains "\n" is not
    visible here; the replace("\n","") call suggests the author was unsure.
  * main now trains one model on wikiEntity_train.out and one on
    wikiNotEntity_train.out, then calls test(...) on the CoNLL-2003 Test-json
    data. The earlier trainData / trainEntityNotEntity path is commented out.
    All corpus paths are hard-coded absolute paths under /shared/corpora.
  * Layout: this copy of the series had lost its line breaks and indentation,
    and the text between angle brackets appears to have been stripped (for
    example `List> readList`, `HashMap> counts`, `THashMap charlms`). Line
    breaks and indentation are restored from the hunk headers; blank context
    lines are placed to match the declared hunk sizes and their exact
    positions are inferred. The stripped text is NOT reconstructed: every
    token is reproduced as found.

 .../CharacterLanguageModel.java | 104 +++++++++++++++++-
 1 file changed, 99 insertions(+), 5 deletions(-)

diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/CharacterLanguageModel.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/CharacterLanguageModel.java
index f5c5cd538..e543ee18e 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/CharacterLanguageModel.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/CharacterLanguageModel.java
@@ -1,11 +1,17 @@
 package edu.illinois.cs.cogcomp.ner.ExpressiveFeatures;
 
+import edu.illinois.cs.cogcomp.ner.IO.InFile;
 import edu.illinois.cs.cogcomp.core.datastructures.Pair;
 import edu.illinois.cs.cogcomp.core.io.LineIO;
 import edu.illinois.cs.cogcomp.core.utilities.StringUtils;
 import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
 import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
 import edu.illinois.cs.cogcomp.ner.LbjTagger.*;
+import gnu.trove.map.hash.THashMap;
+
+import java.io.InputStream;
+import java.io.FileInputStream;
+
 
 import javax.annotation.Resource;
 import java.io.File;
@@ -18,6 +24,7 @@ public class CharacterLanguageModel {
     private HashMap> counts;
     private int order;
     private String pad = "_";
+    private static THashMap charlms = new THashMap<>();
 
     public CharacterLanguageModel(){
         // parameterized how? order of ngrams?
@@ -30,6 +37,9 @@ public CharacterLanguageModel(){
 
         order = 6;
     }
+    static public CharacterLanguageModel getLM(String key) {
+        return charlms.get(key);
+    }
 
     /**
      * Actually returns the log perplexity.
@@ -241,6 +251,73 @@ public static void test() throws FileNotFoundException {
     }
 
 
+
+    public static void test(CharacterLanguageModel eclm, CharacterLanguageModel neclm, Data testData) throws IOException {
+
+        double correct = 0;
+        double total = 0;
+        List outpreds = new ArrayList<>();
+        for(NERDocument doc : testData.documents){
+            for(LinkedVector sentence : doc.sentences){
+                for(int i = 0; i < sentence.size(); i++) {
+                    NEWord word = (NEWord) sentence.get(i);
+                    String label = word.neLabel.equals("O")? "O" : "B-ENT";
+                    double eppl = eclm.perplexity(string2list(word.form));
+                    double neppl = neclm.perplexity(string2list(word.form));
+
+                    String pred;
+
+                    if(word.form.length() < 3){
+                        pred = "O";
+                    }else if(eppl < neppl){
+                        pred = "B-ENT";
+                    }else{
+                        pred = "O";
+                    }
+
+                    if (pred.equals(label)){
+                        //System.out.println(word.form + ": correct");
+                        correct += 1;
+                    }else{
+                        System.out.println(word.form + ": WRONG***");
+                    }
+                    total +=1;
+
+                    outpreds.add(word.form + " " + label + " " + pred);
+                }
+                outpreds.add("");
+            }
+        }
+
+        System.out.println("Accuracy: " + correct / total);
+
+        LineIO.write("pred.txt", outpreds);
+        System.out.println("Wrote to pred.txt. Now run $ conlleval pred.txt to get F1 scores.");
+
+
+    }
+
+
+
+    public static List> readList(String path) {
+
+        List> data = new ArrayList<>();
+        try {
+            InputStream is = new FileInputStream(path);
+            InFile in = new InFile(is);
+            String line = in.readLine();
+            while (line != null) {
+                List splited = new ArrayList<>(Arrays.asList(line.replace("\n","").split(" ")));
+                data.add(splited);
+                line = in.readLine();
+            }
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        return data;
+    }
+
+
     public static void main(String[] args) throws Exception {
         // this trains models, and provides perplexities.
         //test();
@@ -248,16 +325,33 @@ public static void main(String[] args) throws Exception {
         ParametersForLbjCode params = Parameters.readConfigAndLoadExternalData("config/ner.properties", false);
 
 //        String trainpath= "/shared/corpora/ner/conll2003/eng-files/Train-json/";
-//        String testpath = "/shared/corpora/ner/conll2003/eng-files/Test-json/";
+        String testpath = "/shared/corpora/ner/conll2003/eng-files/Test-json/";
+
+//        String trainpath= "/shared/corpora/ner/lorelei-swm-new/ben/Train/";
+//        String testpath = "/shared/corpora/ner/lorelei-swm-new/ben/Test/";
 
-        String trainpath= "/shared/corpora/ner/lorelei-swm-new/ben/Train/";
-        String testpath = "/shared/corpora/ner/lorelei-swm-new/ben/Test/";
+        System.out.println("Reading List");
+        String wiki_ent_file = "/shared/corpora/ner/clm/wikiEntity_train.out";
+        String wiki_nonent_file = "/shared/corpora/ner/clm/wikiNotEntity_train.out";
+        List> wiki_ent = CharacterLanguageModel.readList(wiki_ent_file);
+        List> wiki_non_ent = CharacterLanguageModel.readList(wiki_nonent_file);
+
+        System.out.println("train entity clm");
+        CharacterLanguageModel eclm = new CharacterLanguageModel();
+        eclm.train(wiki_ent);
 
-        Data trainData = new Data(trainpath, trainpath, "-json", new String[] {}, new String[] {}, params);
+        System.out.println("train non entity clm");
+        CharacterLanguageModel neclm = new CharacterLanguageModel();
+        neclm.train(wiki_non_ent);
+
+        System.out.println("Testing");
+//        Data trainData = new Data(trainpath, trainpath, "-json", new String[] {}, new String[] {}, params);
         Data testData = new Data(testpath, testpath, "-json", new String[] {}, new String[] {}, params);
+        CharacterLanguageModel.test(eclm, neclm, testData);
+
 
 
-        trainEntityNotEntity(trainData, testData);
+//        trainEntityNotEntity(trainData, testData);
 
     }
 