From 6f97aa10d19c7aa3d56d78d2ed0d6621e428729a Mon Sep 17 00:00:00 2001 From: allanjie Date: Sun, 4 Jun 2017 18:45:50 +0800 Subject: [PATCH 01/16] using stanford corenlp 3.7.0, dependency and PCFG parser --- pom.xml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 6393d1f..b842500 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ edu.stanford.nlp stanford-corenlp - 3.6.0 + 3.7.0 slf4j-api @@ -123,7 +123,7 @@ edu.stanford.nlp stanford-corenlp - 3.6.0 + 3.7.0 models runtime @@ -330,6 +330,8 @@ edu.stanford.nlp:stanford-corenlp:jar:models:* **/pos-tagger/**/* + **/parser/**/* + **/lexparser/**/* From 90999b1858bbff70e58a0f10a9bb46cc925c761b Mon Sep 17 00:00:00 2001 From: allanjie Date: Sun, 4 Jun 2017 20:37:55 +0800 Subject: [PATCH 02/16] adding the depParser and tree parser --- .../nlp/reader/acereader/ACEReader.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index cd6d14c..5e69e35 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -26,6 +26,8 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.StringUtils; +import justhalf.nlp.depparser.DepParser; +import justhalf.nlp.depparser.StanfordDepParser; import justhalf.nlp.postagger.POSTagger; import justhalf.nlp.postagger.StanfordPOSTagger; import justhalf.nlp.reader.acereader.ACEEntity.ACEEntitySubType; @@ -36,6 +38,8 @@ import justhalf.nlp.reader.acereader.ACERelation.ACERelationType; import justhalf.nlp.reader.acereader.ACEValue.ACEValueSubType; import justhalf.nlp.reader.acereader.ACEValue.ACEValueType; +import justhalf.nlp.sentenceparser.SentenceParser; +import justhalf.nlp.sentenceparser.StanfordSentenceParser; import justhalf.nlp.sentencesplitter.SentenceSplitter; import justhalf.nlp.sentencesplitter.StanfordSentenceSplitter; import justhalf.nlp.tokenizer.RegexTokenizer; @@ -72,9 +76,14 @@ public static void main(String[] args) throws FileNotFoundException{ boolean tokenize = false; boolean posTag = false; + boolean parse = false; + boolean dep = false; Tokenizer tokenizer = null; POSTagger posTagger = null; SentenceSplitter splitter = null; + SentenceParser parser = null; + DepParser depParser = null; + boolean toCoNLL = false; boolean ignoreOverlaps = false; @@ -181,6 +190,33 @@ public static void main(String[] args) throws FileNotFoundException{ } argIndex += 2; break; + case "-parser": + parse = true; + switch(args[argIndex+1]){ + case "stanford": + parser = new StanfordSentenceParser(); + break; + default: + System.out.println("Unrecognized sentence parser \""+args[argIndex+1]+"\", using stanford."); + parser = new StanfordSentenceParser(); + break; + } + argIndex += 2; + break; + case "-depparser": + dep = true; + switch(args[argIndex+1]){ + case "stanford": + //by default it's UD parser + depParser = new StanfordDepParser(); + break; + default: + System.out.println("Unrecognized dependency parser \""+args[argIndex+1]+"\", using stanford."); + depParser = new StanfordDepParser(); + break; + } + argIndex += 2; + break; case "-splitter": switch(args[argIndex+1]){ case "stanford": From 15c93f02862a454835d2e50da0555d96607fabd1 Mon Sep 17 00:00:00 2001 From: allanjie Date: Sun, 4 Jun 2017 23:52:01 +0800 Subject: [PATCH 03/16] add the dependency parse result from stanford corenlp (not test yet) --- .../nlp/depparser/StanfordDepParser.java | 3 +- .../nlp/reader/acereader/ACEReader.java | 45 ++++++++++++++++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java b/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java index 7b10cd8..559ac49 100644 --- a/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java +++ b/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java @@ -7,6 +7,7 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.parser.nndep.DependencyParser; import edu.stanford.nlp.trees.GrammaticalStructure; +import edu.stanford.nlp.trees.GrammaticalStructure.Extras; import edu.stanford.nlp.trees.TypedDependency; /** @@ -53,7 +54,7 @@ public boolean isThreadSafe() { public List parse(List sentence) { check(sentence); GrammaticalStructure structure = dependencyParser.predict(sentence); - return structure.typedDependenciesCCprocessed(); + return structure.typedDependencies(Extras.NONE); } private void check(List sentence){ diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 5e69e35..3ad6ade 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -25,6 +25,7 @@ import org.xml.sax.SAXException; import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.StringUtils; import justhalf.nlp.depparser.DepParser; import justhalf.nlp.depparser.StanfordDepParser; @@ -449,20 +450,20 @@ public static void main(String[] args) throws FileNotFoundException{ if(ace2004Docs.size() > 0){ System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data"); printDataset(ace2004OutputDir, ace2004Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, splitter, + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); } if(ace2005Docs.size() > 0){ System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data"); printDataset(ace2005OutputDir, ace2005Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); } } } private static void printDataset(String outputDir, List docs, double[] datasplit, - boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, + boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, boolean splitByDocument, boolean shuffle, int shuffleSeed) throws FileNotFoundException { List trainSentences = new ArrayList(); @@ -483,9 +484,9 @@ private static void printDataset(String outputDir, List docs, doubl testSentences = new ArrayList(); splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed); } - writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); - writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); - writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); + writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); } /** @@ -795,7 +796,7 @@ private static > List sorted(Collection coll){ } private static void writeData(List sentences, String outputDir, String name, - Tokenizer tokenizer, POSTagger posTagger, boolean printEntities, boolean printRelations, + Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, boolean printEntities, boolean printRelations, boolean toCoNLL, boolean useBILOU) throws FileNotFoundException{ PrintWriter printer = new PrintWriter(new File(outputDir+name)); for(ACESentence sentence: sentences){ @@ -804,6 +805,18 @@ private static void writeData(List sentences, String outputDir, Str if(posTagger != null){ posTagger.tagCoreLabels(tokens); } + int[] heads = null; + String[] depLabels = null; + if (depParser != null && depParser instanceof StanfordDepParser) { + heads = new int[tokens.size()]; + depLabels = new String[tokens.size()]; + List deps = depParser.parse(tokens); + for (int i = 0; i < deps.size(); i++) { + TypedDependency typedDep = deps.get(i); + heads[typedDep.dep().index() - 1] = typedDep.gov().index() - 1; + depLabels[typedDep.dep().index() - 1] = typedDep.reln().getShortName(); + } + } if(toCoNLL){ List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); if(posTagger != null){ @@ -844,6 +857,24 @@ private static void writeData(List sentences, String outputDir, Str } printer.println(stringBuilder.toString()); } + if (heads != null && depLabels !=null) { + stringBuilder = new StringBuilder(); + for (int i = 0; i < heads.length; i++) { + if(stringBuilder.length() > 0){ + stringBuilder.append(" "); + } + stringBuilder.append(heads[i]+""); + } + printer.println(stringBuilder.toString()); + stringBuilder = new StringBuilder(); + for (int i = 0; i < depLabels.length; i++) { + if(stringBuilder.length() > 0){ + stringBuilder.append(" "); + } + stringBuilder.append(depLabels[i]); + } + printer.println(stringBuilder.toString()); + } if(printEntities){ stringBuilder = new StringBuilder(); for(ACEEntityMention mention: sentence.entities){ From 8d831c74f82b4ef439c1bd2b19d4b28aac2e88c6 Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 5 Jun 2017 11:27:32 +0800 Subject: [PATCH 04/16] adding the mention type, dependency information and parse tree information using stanford --- .../nlp/reader/acereader/ACEReader.java | 33 ++++++++++++------- .../nlp/sentenceparser/SentenceParser.java | 10 ++++++ .../StanfordSentenceParser.java | 6 ++++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 3ad6ade..8a68cb0 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -25,6 +25,7 @@ import org.xml.sax.SAXException; import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.StringUtils; import justhalf.nlp.depparser.DepParser; @@ -450,13 +451,13 @@ public static void main(String[] args) throws FileNotFoundException{ if(ace2004Docs.size() > 0){ System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data"); printDataset(ace2004OutputDir, ace2004Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, splitter, - toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, + splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); } if(ace2005Docs.size() > 0){ System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data"); printDataset(ace2005OutputDir, ace2005Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); } } @@ -464,7 +465,7 @@ public static void main(String[] args) throws FileNotFoundException{ private static void printDataset(String outputDir, List docs, double[] datasplit, boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, - SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, + SentenceParser parser, SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, boolean splitByDocument, boolean shuffle, int shuffleSeed) throws FileNotFoundException { List trainSentences = new ArrayList(); List devSentences = new ArrayList(); @@ -484,9 +485,9 @@ private static void printDataset(String outputDir, List docs, doubl testSentences = new ArrayList(); splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed); } - writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); - writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); - writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); } /** @@ -796,7 +797,7 @@ private static > List sorted(Collection coll){ } private static void writeData(List sentences, String outputDir, String name, - Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, boolean printEntities, boolean printRelations, + Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, SentenceParser parser, boolean printEntities, boolean printRelations, boolean toCoNLL, boolean useBILOU) throws FileNotFoundException{ PrintWriter printer = new PrintWriter(new File(outputDir+name)); for(ACESentence sentence: sentences){ @@ -817,6 +818,10 @@ private static void writeData(List sentences, String outputDir, Str depLabels[typedDep.dep().index() - 1] = typedDep.reln().getShortName(); } } + Tree parseTree = null; + if (parser != null) { + parseTree = parser.parseCoreLabel(tokens); + } if(toCoNLL){ List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); if(posTagger != null){ @@ -857,6 +862,12 @@ private static void writeData(List sentences, String outputDir, Str } printer.println(stringBuilder.toString()); } + if (posTagger != null && parser != null) { + //Note: the line above parser string must contain something. otherwise the penn reader later have error. + stringBuilder = new StringBuilder(); + stringBuilder = parseTree.toStringBuilder(stringBuilder); + printer.println(stringBuilder.toString()); + } if (heads != null && depLabels !=null) { stringBuilder = new StringBuilder(); for (int i = 0; i < heads.length; i++) { @@ -883,7 +894,7 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append("|"); } - stringBuilder.append(String.format("%s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + stringBuilder.append(String.format("%s,%s,%s,%s %s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name())); } printer.println(stringBuilder.toString()); } @@ -898,12 +909,11 @@ private static void writeData(List sentences, String outputDir, Str for(ACEEntityMention mention: relation.args){ Span span = findWordSpan(mention.span, tokens); Span headSpan = findWordSpan(mention.headSpan, tokens); - stringBuilder.append(String.format(" %s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name())); } } printer.println(stringBuilder.toString()); } - printer.println(); } } else { @@ -1079,6 +1089,7 @@ public static List readDocuments(String ace2004DirName, String ace2 private static void extractDocList(List fileList, String aceDirName, Collection aceDomains, String... additionalPath) { File aceDir = new File(aceDirName); + System.err.println(aceDirName); for(File subdir: aceDir.listFiles()){ if(!subdir.isDirectory()){ continue; diff --git a/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java b/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java index 1288784..964a976 100644 --- a/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java +++ b/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java @@ -2,6 +2,7 @@ import java.util.List; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.Tree; import justhalf.nlp.NLPInterface; @@ -27,4 +28,13 @@ public interface SentenceParser extends NLPInterface{ * The parsed sentence */ public Tree parse(List sentence); + + /** + * Parse pretagged sentence, returning a Tree. + * @param sentence + * The sentence to be parsed as a list of tokens and also labels + * @return the parsed tree + */ + public Tree parseCoreLabel(List sentence); + } diff --git a/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java b/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java index fe22f29..cd784ab 100644 --- a/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java +++ b/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java @@ -2,6 +2,7 @@ import java.util.List; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.trees.Tree; @@ -29,6 +30,11 @@ public Tree parse(List sentence) { return parser.parseStrings(sentence); } + @Override + public Tree parseCoreLabel(List sentence) { + return parser.parse(sentence); + } + @Override public boolean isThreadSafe(){ return true; From bc742a29287f8187a0933d0aca25d2f0074a2f9c Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 5 Jun 2017 11:29:05 +0800 Subject: [PATCH 05/16] remove useless println --- src/main/java/justhalf/nlp/reader/acereader/ACEReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 8a68cb0..2ceb07e 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -1089,7 +1089,6 @@ public static List readDocuments(String ace2004DirName, String ace2 private static void extractDocList(List fileList, String aceDirName, Collection aceDomains, String... additionalPath) { File aceDir = new File(aceDirName); - System.err.println(aceDirName); for(File subdir: aceDir.listFiles()){ if(!subdir.isDirectory()){ continue; From 4e0de57d5b0a4d71f7ad10695e8118bd6e6d7ba2 Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 5 Jun 2017 14:00:48 +0800 Subject: [PATCH 06/16] add the split folds option to split the data into a few folds --- .../nlp/reader/acereader/ACEReader.java | 81 +++++++++++++++---- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 2ceb07e..bacd2df 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -70,6 +70,7 @@ public static void main(String[] args) throws FileNotFoundException{ HashSet ace2005Domains = new LinkedHashSet(ACE2005_DOMAINS); double[] datasplit = null; + int foldNum = 0; //use when split fold boolean print = false; boolean printEntities = false; boolean printRelations = false; @@ -91,6 +92,7 @@ public static void main(String[] args) throws FileNotFoundException{ boolean ignoreOverlaps = false; boolean useBILOU = false; boolean splitByDocument = true; + boolean splitFolds = false; boolean shuffle = false; boolean excludeMetadata = false; int shuffleSeed = 31; @@ -247,6 +249,11 @@ public static void main(String[] args) throws FileNotFoundException{ splitByDocument = false; argIndex += 1; break; + case "-splitFolds": + splitFolds = true; //note: careful about the split. + foldNum = Integer.valueOf(args[argIndex + 1]); + argIndex += 2; + break; case "-shuffle": shuffle = true; argIndex += 1; @@ -280,7 +287,7 @@ public static void main(String[] args) throws FileNotFoundException{ printHelp("Please specify the output directory for ACE2005."); System.exit(0); } - if(datasplit == null){ + if(datasplit == null && foldNum == 0){ printHelp("Please specify the datasplit with -dataSplit option."); System.exit(0); } @@ -450,34 +457,43 @@ public static void main(String[] args) throws FileNotFoundException{ if(print){ if(ace2004Docs.size() > 0){ System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data"); - printDataset(ace2004OutputDir, ace2004Docs, datasplit, printEntities, printRelations, + printDataset(ace2004OutputDir, ace2004Docs, datasplit, foldNum, printEntities, printRelations, (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, - splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); + splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, splitFolds, shuffle, shuffleSeed); } if(ace2005Docs.size() > 0){ System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data"); - printDataset(ace2005OutputDir, ace2005Docs, datasplit, printEntities, printRelations, + printDataset(ace2005OutputDir, ace2005Docs, datasplit, foldNum, printEntities, printRelations, (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, - splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); + splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, splitFolds, shuffle, shuffleSeed); } } } - private static void printDataset(String outputDir, List docs, double[] datasplit, + private static void printDataset(String outputDir, List docs, double[] datasplit, int foldNum, boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, SentenceParser parser, SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, - boolean splitByDocument, boolean shuffle, int shuffleSeed) throws FileNotFoundException { + boolean splitByDocument, boolean splitFolds, boolean shuffle, int shuffleSeed) throws FileNotFoundException { List trainSentences = new ArrayList(); List devSentences = new ArrayList(); List testSentences = new ArrayList(); + List> foldSentences = new ArrayList<>(); if(splitByDocument){ - List trainDocs = new ArrayList(); - List devDocs = new ArrayList(); - List testDocs = new ArrayList(); - splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); - trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps); - devSentences = getSentences(devDocs, splitter, ignoreOverlaps); - testSentences = getSentences(testDocs, splitter, ignoreOverlaps); + if (splitFolds) { + List> foldDocs = new ArrayList<>(); + splitFolds(docs, foldDocs, foldNum, shuffle, shuffleSeed); + for (int i = 0; i < foldDocs.size(); i++) { + foldSentences.add(getSentences(foldDocs.get(i), splitter, ignoreOverlaps)); + } + } else { + List trainDocs = new ArrayList(); + List devDocs = new ArrayList(); + List testDocs = new ArrayList(); + splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); + trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps); + devSentences = getSentences(devDocs, splitter, ignoreOverlaps); + testSentences = getSentences(testDocs, splitter, ignoreOverlaps); + } } else { List aceSentences = getSentences(docs, splitter, ignoreOverlaps); trainSentences = new ArrayList(); @@ -485,9 +501,17 @@ private static void printDataset(String outputDir, List docs, doubl testSentences = new ArrayList(); splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed); } - writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); - writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); - writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + + if (splitFolds) { + for (int i = 0; i < foldSentences.size(); i++) { + int fold = i + 1; + writeData(foldSentences.get(i), outputDir, "/fold"+fold+".data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + } + } else { + writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + } } /** @@ -760,6 +784,29 @@ private static void splitData(List aceObjects, List trainObjects, List System.out.println("Test: "+testObjects.size()); } + private static void splitFolds(List aceObjects, List> foldObjects, int foldNum, boolean shuffle, int shuffleSeed){ + int total = aceObjects.size(); + int foldSize = total / foldNum; //last fold may contain more + List tmpObjects = new ArrayList(); + tmpObjects.addAll(aceObjects); + if(shuffle){ + Collections.shuffle(tmpObjects, new Random(shuffleSeed)); + } + for (int f = 0; f < foldNum; f++) { + List foldList = new ArrayList(); + int end = foldSize * (f + 1); + if (f == foldNum - 1 && end < total) end = total; + foldList.addAll(tmpObjects.subList(foldSize * f, end)); + foldObjects.add(foldList); + } + String typeName = tmpObjects.get(0).getClass().getName(); + typeName = typeName.substring(typeName.lastIndexOf(".")+1); + System.out.println("Number of objects ("+typeName+"):"); + System.out.println("Number of folds: "+foldNum); + System.out.println("Fold size: "+foldSize); + System.out.println("Last Fold Size: "+(total - foldSize * foldNum + foldSize)); + } + private static void printStatistics(List sentences){ Map entityCounts = new HashMap(); for(ACESentence sentence: sentences){ From e0992fdfaf1eb7343f8a0e2b561adc94665e7f7d Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 5 Jun 2017 21:29:25 +0800 Subject: [PATCH 07/16] temporarily change the use of value and word, may need to change it back --- .../java/justhalf/nlp/reader/acereader/ACEReader.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index bacd2df..9fa0ea3 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -505,7 +505,7 @@ private static void printDataset(String outputDir, List docs, doubl if (splitFolds) { for (int i = 0; i < foldSentences.size(); i++) { int fold = i + 1; - writeData(foldSentences.get(i), outputDir, "/fold"+fold+".data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(foldSentences.get(i), outputDir, "/fold."+fold+".data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); } } else { writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); @@ -850,6 +850,9 @@ private static void writeData(List sentences, String outputDir, Str for(ACESentence sentence: sentences){ if(tokenizer != null){ List tokens = fixTokens(tokenizer.tokenize(sentence.text)); + for (CoreLabel token : tokens) { + token.setValue(escapeBracket(token.value())); + } if(posTagger != null){ posTagger.tagCoreLabels(tokens); } @@ -895,8 +898,7 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append(" "); } - stringBuilder.append(token.value()); - token.setWord(escapeBracket(token.word())); + stringBuilder.append(token.word()); } printer.println(stringBuilder.toString()); if(posTagger != null){ From 4ee323257274e66dffacea9e713aad9f0a8c52f0 Mon Sep 17 00:00:00 2001 From: allanjie Date: Tue, 6 Jun 2017 20:00:15 +0800 Subject: [PATCH 08/16] add the subtype information in ACE reader, note that ACE2004 per has no subtype --- src/main/java/justhalf/nlp/reader/acereader/ACEReader.java | 4 ++-- src/main/java/justhalf/nlp/reader/acereader/ACERelation.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 9fa0ea3..731ff43 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -943,7 +943,7 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append("|"); } - stringBuilder.append(String.format("%s,%s,%s,%s %s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name())); + stringBuilder.append(String.format("%s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } printer.println(stringBuilder.toString()); } @@ -958,7 +958,7 @@ private static void writeData(List sentences, String outputDir, Str for(ACEEntityMention mention: relation.args){ Span span = findWordSpan(mention.span, tokens); Span headSpan = findWordSpan(mention.headSpan, tokens); - stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name())); + stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } } printer.println(stringBuilder.toString()); diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java b/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java index 846ef44..6664bda 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java @@ -20,8 +20,8 @@ public static enum ACERelationType implements ACEObjectType { // Only in ACE2004 EMP_ORG("Employee/Membership/Subsidiary", true, false), - OTHER_AFF("PER/ORG Affiliation", true, false), - GPE_AFF("GPE Affiliation", true, false), + OTHER_AFF("PER/ORG-Affiliation", true, false), + GPE_AFF("GPE-Affiliation", true, false), DISC("Discourse", true, false), // Only in ACE2005 From 5bde4ce2fff86723d14bfe90c2e47f5aff9a229f Mon Sep 17 00:00:00 2001 From: allanjie Date: Wed, 28 Jun 2017 21:17:28 +0800 Subject: [PATCH 09/16] add file name member in ACEDocument class --- .gitignore | 2 ++ .../nlp/reader/acereader/ACEDocument.java | 16 +++++++++------- .../justhalf/nlp/reader/acereader/ACEReader.java | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 4c04fd5..ba0f06f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ .project .settings target/ +data/* +.gitignore diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java index bcf41bc..d8655db 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java @@ -83,6 +83,7 @@ public class ACEDocument implements Serializable{ private static final boolean TEST_STRICT_PARSING = false; private static final long serialVersionUID = -4698300709681532759L; + public String fileName; public String text; public String fullText; public int offset; @@ -103,16 +104,16 @@ public class ACEDocument implements Serializable{ public Map objectsById; public Map> objectMentionsById; - public ACEDocument(String sgmFilename) throws IOException, SAXException { - this(sgmFilename, false); + public ACEDocument(String fileName, String sgmFilename) throws IOException, SAXException { + this(fileName, sgmFilename, false); } - public ACEDocument(String sgmFilename, boolean excludeMetadata) throws IOException, SAXException { - this(sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata); + public ACEDocument(String fileName, String sgmFilename, boolean excludeMetadata) throws IOException, SAXException { + this(fileName, sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata); } - public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException { - this(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename), + public ACEDocument(String fileName, String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException { + this(fileName, IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename), IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(apfFilename), excludeMetadata); } @@ -126,7 +127,7 @@ public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetada * @throws IOException * @throws SAXException */ - public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{ + public ACEDocument(String fileName, InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{ DOMParser parser = new DOMParser(); String sgmText = IOUtils.slurpInputStream(sgmStream, "UTF-8"); sgmText = sgmText.replaceAll("<(/)?BODY>", "<$1BODY_TEXT>"); @@ -143,6 +144,7 @@ public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean exclude } else { this.text = this.fullText; } + this.fileName = fileName; this.textInLowercase = this.text.equals(this.text.toLowerCase()); this.offset = fullText.indexOf(text); diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 731ff43..6925b10 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -350,7 +350,7 @@ public static void main(String[] args) throws FileNotFoundException{ Map eventTypeMentionCount = new HashMap(); for(File sgmFile: fileList){ try { - ACEDocument doc = new ACEDocument(sgmFile.getAbsolutePath(), excludeMetadata); + ACEDocument doc = new ACEDocument(sgmFile.getName(), sgmFile.getAbsolutePath(), excludeMetadata); docCount++; // printMentions(doc, doc.mentions); From 94e6e906294c81dc3be96a3845bcbf7d0a50a6b3 Mon Sep 17 00:00:00 2001 From: allanjie Date: Wed, 28 Jun 2017 21:18:26 +0800 Subject: [PATCH 10/16] small bug --- src/main/java/justhalf/nlp/reader/acereader/ACEReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 6925b10..0fc8f22 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -1131,7 +1131,7 @@ public static List readDocuments(String ace2004DirName, String ace2 extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm"); } for(File sgmFile: fileList){ - result.add(new ACEDocument(sgmFile.getAbsolutePath())); + result.add(new ACEDocument(sgmFile.getName(), sgmFile.getAbsolutePath())); } return result; } From ddccb4b325400de10bcd2ae1f287b30935c4bfd9 Mon Sep 17 00:00:00 2001 From: allanjie Date: Sat, 1 Jul 2017 08:34:15 +0800 Subject: [PATCH 11/16] change stanford CoreNLP to 3.8 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index b842500..662a977 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ edu.stanford.nlp stanford-corenlp - 3.7.0 + 3.8.0 slf4j-api @@ -123,7 +123,7 @@ edu.stanford.nlp stanford-corenlp - 3.7.0 + 3.8.0 models runtime From b25a446908c01361fa2722187cb7306f7b407bd4 Mon Sep 17 00:00:00 2001 From: allanjie Date: Tue, 25 Jul 2017 18:28:59 +0800 Subject: [PATCH 12/16] print the type only --- .../java/justhalf/nlp/reader/acereader/ACEDocument.java | 1 + src/main/java/justhalf/nlp/reader/acereader/ACEReader.java | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java index d8655db..d66541f 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java @@ -298,6 +298,7 @@ private ACEEntityMention getMention(Node entityMention, ACEEntity aceEntity){ Span span = getSpan(extentCharseq); String aceText = extentCharseq.getTextContent(); Node head = ((Element)entityMention).getElementsByTagName("HEAD_EXTENT").item(0); + if (head == null) throw new RuntimeException("No head span?"); Node headCharseq = head == null ? null : ((Element)head).getElementsByTagName("CHARSEQ").item(0); Span headSpan = headCharseq == null ? null : getSpan(headCharseq); String aceHeadText = headCharseq == null ? "" : headCharseq.getTextContent(); diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 0fc8f22..436ffd8 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -943,7 +943,8 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append("|"); } - stringBuilder.append(String.format("%s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); + stringBuilder.append(String.format("%s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format("%s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } printer.println(stringBuilder.toString()); } @@ -958,7 +959,8 @@ private static void writeData(List sentences, String outputDir, Str for(ACEEntityMention mention: relation.args){ Span span = findWordSpan(mention.span, tokens); Span headSpan = findWordSpan(mention.headSpan, tokens); - stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); + stringBuilder.append(String.format(" %s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } } printer.println(stringBuilder.toString()); From 79d72c14d077d67e6e601c0d75f57114074e2970 Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 7 Aug 2017 16:48:13 +0800 Subject: [PATCH 13/16] print the relation extent span --- src/main/java/justhalf/nlp/reader/acereader/ACEReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 436ffd8..12c08d0 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -955,7 +955,9 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append("|"); } - stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); + Span relSpan = findWordSpan(relation.span, tokens); + stringBuilder.append(relation.relation.type() + "::" + relSpan.start + ","+relSpan.end); + //stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); for(ACEEntityMention mention: relation.args){ Span span = findWordSpan(mention.span, tokens); Span headSpan = findWordSpan(mention.headSpan, tokens); From d3e5b805ce3dd42bd4056657db5d4a32b76f9803 Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 7 Aug 2017 18:42:49 +0800 Subject: [PATCH 14/16] add split options according to the file name --- .../nlp/reader/acereader/ACEReader.java | 94 +++++++++++++++++-- 1 file changed, 86 insertions(+), 8 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 12c08d0..6c70138 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -5,9 +5,12 @@ import static justhalf.nlp.reader.acereader.ACEDocument.unescape; +import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; @@ -66,6 +69,8 @@ public class ACEReader { public static void main(String[] args) throws FileNotFoundException{ String ace2004DirName = null; String ace2005DirName = null; + String ace2004SplitFolder = null; + String ace2005SplitFolder = null; HashSet ace2004Domains = new LinkedHashSet(ACE2004_DOMAINS); HashSet ace2005Domains = new LinkedHashSet(ACE2005_DOMAINS); @@ -92,7 +97,8 @@ public static void main(String[] args) throws FileNotFoundException{ boolean ignoreOverlaps = false; boolean useBILOU = false; boolean splitByDocument = true; - boolean splitFolds = false; + boolean ace2004SplitFolds = false; + boolean ace2005SplitFolds = false; boolean shuffle = false; boolean excludeMetadata = false; int shuffleSeed = 31; @@ -249,11 +255,24 @@ public static void main(String[] args) throws FileNotFoundException{ splitByDocument = false; argIndex += 1; break; - case "-splitFolds": - splitFolds = true; //note: careful about the split. + case "-ace2004SplitFolds": + ace2004SplitFolds = true; //note: careful about the split. foldNum = Integer.valueOf(args[argIndex + 1]); argIndex += 2; break; + case "-ace2005SplitFolds": + ace2004SplitFolds = true; //note: careful about the split. + foldNum = Integer.valueOf(args[argIndex + 1]); + argIndex += 2; + break; + case "-ace2004SplitFolder": + ace2004SplitFolder = args[argIndex + 1]; + argIndex += 2; + break; + case "-ace2005SplitFolder": + ace2005SplitFolder = args[argIndex + 1]; + argIndex += 2; + break; case "-shuffle": shuffle = true; argIndex += 1; @@ -459,13 +478,13 @@ public static void main(String[] args) throws FileNotFoundException{ System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data"); printDataset(ace2004OutputDir, ace2004Docs, datasplit, foldNum, printEntities, printRelations, (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, - splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, splitFolds, shuffle, shuffleSeed); + splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, ace2004SplitFolds, ace2004SplitFolder, shuffle, shuffleSeed); } if(ace2005Docs.size() > 0){ System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data"); printDataset(ace2005OutputDir, ace2005Docs, datasplit, foldNum, printEntities, printRelations, (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, - splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, splitFolds, shuffle, shuffleSeed); + splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, ace2005SplitFolds, ace2005SplitFolder, shuffle, shuffleSeed); } } } @@ -473,7 +492,7 @@ public static void main(String[] args) throws FileNotFoundException{ private static void printDataset(String outputDir, List docs, double[] datasplit, int foldNum, boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, SentenceParser parser, SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, - boolean splitByDocument, boolean splitFolds, boolean shuffle, int shuffleSeed) throws FileNotFoundException { + boolean splitByDocument, boolean splitFolds, String splitFolder, boolean shuffle, int shuffleSeed) throws FileNotFoundException { List trainSentences = new ArrayList(); List devSentences = new ArrayList(); List testSentences = new ArrayList(); @@ -481,7 +500,12 @@ private static void printDataset(String outputDir, List docs, doubl if(splitByDocument){ if (splitFolds) { List> foldDocs = new ArrayList<>(); - splitFolds(docs, foldDocs, foldNum, shuffle, shuffleSeed); + if (splitFolder == null) { + splitFolds(docs, foldDocs, foldNum, shuffle, shuffleSeed); + } else { + List> foldFileNames = readSplits(splitFolder); + splitFolds(docs, foldDocs, foldFileNames); + } for (int i = 0; i < foldDocs.size(); i++) { foldSentences.add(getSentences(foldDocs.get(i), splitter, ignoreOverlaps)); } @@ -489,7 +513,16 @@ private static void printDataset(String outputDir, List docs, doubl List trainDocs = new ArrayList(); List devDocs = new ArrayList(); List testDocs = new ArrayList(); - splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); + if (splitFolder == null) { + splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); + } else { + List> foldFileNames = readSplits(splitFolder); + List> foldDocs = new ArrayList<>(); + splitFolds(docs, foldDocs, foldFileNames); + trainDocs = foldDocs.get(0); + devDocs = foldDocs.get(1); + testDocs = foldDocs.get(2); + } trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps); devSentences = getSentences(devDocs, splitter, ignoreOverlaps); testSentences = getSentences(testDocs, splitter, ignoreOverlaps); @@ -759,6 +792,27 @@ private static List fixTokens(List tokens){ return result; } + private static List> readSplits (String splitFile) { + List> splits = new ArrayList<>(); + try { + File folder = new File(splitFile); + File[] files = folder.listFiles(); + for(File file : files) { + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file.getAbsolutePath()),"UTF-8")); + String line = null; + List split = new ArrayList<>(); + while ((line = br.readLine()) != null) { + split.add(line); + } + splits.add(split); + br.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + return splits; + } + private static void splitData(List aceObjects, List trainObjects, List devObjects, List testObjects, double[] datasplit, boolean shuffle, int shuffleSeed){ int total = aceObjects.size(); @@ -807,6 +861,30 @@ private static void splitFolds(List aceObjects, List> foldObjects System.out.println("Last Fold Size: "+(total - foldSize * foldNum + foldSize)); } + private static void splitFolds(List aceObjects, List> foldObjects, List> foldFileNames){ + Map name2Obj = new HashMap<>(); + for(ACEDocument doc : aceObjects) { + String trimedFileName = doc.fileName.replace(".sgm", ""); + name2Obj.put(trimedFileName, doc); + } + if (name2Obj.size() != aceObjects.size()) throw new RuntimeException("not same size?"); + + for (int f = 0; f < foldFileNames.size(); f++) { + List foldList = new ArrayList(); + List fileNames = foldFileNames.get(f); + for(String fileName : fileNames) { + ACEDocument doc = name2Obj.get(fileName); + foldList.add(doc); + } + foldObjects.add(foldList); + System.out.println("Fold "+(f+1)+" size: "+fileNames.size()); + } + String typeName = aceObjects.get(0).getClass().getName(); + typeName = typeName.substring(typeName.lastIndexOf(".")+1); + System.out.println("Number of objects ("+typeName+"):"); + System.out.println("Number of folds: "+foldFileNames.size()); + } + private static void printStatistics(List sentences){ Map entityCounts = new HashMap(); for(ACESentence sentence: sentences){ From 43e37048603d1f38778bd4bff4c9b78cac7ba90b Mon Sep 17 00:00:00 2001 From: allanjie Date: Mon, 7 Aug 2017 19:03:01 +0800 Subject: [PATCH 15/16] fix train dev test order --- .../nlp/reader/acereader/ACEReader.java | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 6c70138..5d2407d 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -503,7 +503,7 @@ private static void printDataset(String outputDir, List docs, doubl if (splitFolder == null) { splitFolds(docs, foldDocs, foldNum, shuffle, shuffleSeed); } else { - List> foldFileNames = readSplits(splitFolder); + List> foldFileNames = readSplits(splitFolder, true); splitFolds(docs, foldDocs, foldFileNames); } for (int i = 0; i < foldDocs.size(); i++) { @@ -516,7 +516,7 @@ private static void printDataset(String outputDir, List docs, doubl if (splitFolder == null) { splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); } else { - List> foldFileNames = readSplits(splitFolder); + List> foldFileNames = readSplits(splitFolder, false); List> foldDocs = new ArrayList<>(); splitFolds(docs, foldDocs, foldFileNames); trainDocs = foldDocs.get(0); @@ -792,11 +792,16 @@ private static List fixTokens(List tokens){ return result; } - private static List> readSplits (String splitFile) { + private static List> readSplits (String splitFile, boolean isFolds) { List> splits = new ArrayList<>(); try { File folder = new File(splitFile); File[] files = folder.listFiles(); + if (!isFolds) { + for(File file : files) { + if (!file.isDirectory()) splits.add(new ArrayList<>()); + } + } for(File file : files) { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file.getAbsolutePath()),"UTF-8")); String line = null; @@ -804,7 +809,20 @@ private static List> readSplits (String splitFile) { while ((line = br.readLine()) != null) { split.add(line); } - splits.add(split); + if (!isFolds) { + if (file.getName().equals("train")) { + splits.set(0, split); + } else if (file.getName().equals("dev")) { + splits.set(1, split); + } else if (file.getName().equals("test")) { + splits.set(2, split); + } else { + br.close(); + throw new RuntimeException("unknow:" + file.getName()); + } + } else { + splits.add(split); + } br.close(); } } catch (IOException e) { @@ -874,6 +892,7 @@ private static void splitFolds(List aceObjects, List fileNames = foldFileNames.get(f); for(String fileName : fileNames) { ACEDocument doc = name2Obj.get(fileName); + if (doc==null) throw new RuntimeException("null doc?"); foldList.add(doc); } foldObjects.add(foldList); From 38eaabd60b7032e6efbde8622797c46da46a06f2 Mon Sep 17 00:00:00 2001 From: Allan Date: Tue, 1 May 2018 21:05:24 +0800 Subject: [PATCH 16/16] add some code for conll format, and conll format only use head span --- .../nlp/reader/acereader/ACEReader.java | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index 5d2407d..6719c97 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -970,13 +970,17 @@ private static void writeData(List sentences, String outputDir, Str parseTree = parser.parseCoreLabel(tokens); } if(toCoNLL){ - List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); +// List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); //use full span + List outputTokens = headSpansToLabels(sentence.entities, tokens, useBILOU); //use head span. if(posTagger != null){ for(int i=0; i sentences, String outputDir, Str printer.println(String.format("%s\t%s",tokens.get(i).value(), annotations)); } } + if(printRelations){ + StringBuilder stringBuilder = new StringBuilder(); + for(ACERelationMention relation: sentence.relations){ + if(stringBuilder.length() > 0){ + stringBuilder.append("|"); + } +// Span relSpan = findWordSpan(relation.span, tokens); + stringBuilder.append(relation.relation.type()); + //stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); + for(ACEEntityMention mention: relation.args){ +// Span span = findWordSpan(mention.span, tokens); + Span headSpan = findWordSpan(mention.headSpan, tokens); + stringBuilder.append(String.format(" %s,%s %s", headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); + } + } + printer.println(stringBuilder.toString()); + } printer.println(); } else { StringBuilder stringBuilder = new StringBuilder(); @@ -1145,6 +1167,44 @@ private static List spansToLabels(List mentions, Li return Arrays.asList(result); } + private static List headSpansToLabels(List mentions, List tokens, boolean useBILOU){ + WordLabel[] result = new WordLabel[tokens.size()]; + Arrays.fill(result, null); + for(ACEEntityMention mention: mentions){ + Span span = findWordSpan(mention.headSpan, tokens); + String type = mention.label.form; + for(int i=span.start; i