diff --git a/.gitignore b/.gitignore index 4c04fd5..ba0f06f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ .project .settings target/ +data/* +.gitignore diff --git a/pom.xml b/pom.xml index 6393d1f..662a977 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ edu.stanford.nlp stanford-corenlp - 3.6.0 + 3.8.0 slf4j-api @@ -123,7 +123,7 @@ edu.stanford.nlp stanford-corenlp - 3.6.0 + 3.8.0 models runtime @@ -330,6 +330,8 @@ edu.stanford.nlp:stanford-corenlp:jar:models:* **/pos-tagger/**/* + **/parser/**/* + **/lexparser/**/* diff --git a/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java b/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java index 7b10cd8..559ac49 100644 --- a/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java +++ b/src/main/java/justhalf/nlp/depparser/StanfordDepParser.java @@ -7,6 +7,7 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.parser.nndep.DependencyParser; import edu.stanford.nlp.trees.GrammaticalStructure; +import edu.stanford.nlp.trees.GrammaticalStructure.Extras; import edu.stanford.nlp.trees.TypedDependency; /** @@ -53,7 +54,7 @@ public boolean isThreadSafe() { public List parse(List sentence) { check(sentence); GrammaticalStructure structure = dependencyParser.predict(sentence); - return structure.typedDependenciesCCprocessed(); + return structure.typedDependencies(Extras.NONE); } private void check(List sentence){ diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java index bcf41bc..d66541f 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEDocument.java @@ -83,6 +83,7 @@ public class ACEDocument implements Serializable{ private static final boolean TEST_STRICT_PARSING = false; private static final long serialVersionUID = -4698300709681532759L; + public String fileName; public String text; public String fullText; public int offset; @@ -103,16 +104,16 @@ public class ACEDocument implements Serializable{ public Map objectsById; public Map> objectMentionsById; - public ACEDocument(String sgmFilename) throws IOException, SAXException { - this(sgmFilename, false); + public ACEDocument(String fileName, String sgmFilename) throws IOException, SAXException { + this(fileName, sgmFilename, false); } - public ACEDocument(String sgmFilename, boolean excludeMetadata) throws IOException, SAXException { - this(sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata); + public ACEDocument(String fileName, String sgmFilename, boolean excludeMetadata) throws IOException, SAXException { + this(fileName, sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata); } - public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException { - this(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename), + public ACEDocument(String fileName, String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException { + this(fileName, IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename), IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(apfFilename), excludeMetadata); } @@ -126,7 +127,7 @@ public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetada * @throws IOException * @throws SAXException */ - public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{ + public ACEDocument(String fileName, InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{ DOMParser parser = new DOMParser(); String sgmText = IOUtils.slurpInputStream(sgmStream, "UTF-8"); sgmText = sgmText.replaceAll("<(/)?BODY>", "<$1BODY_TEXT>"); @@ -143,6 +144,7 @@ public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean exclude } else { this.text = this.fullText; } + this.fileName = fileName; this.textInLowercase = this.text.equals(this.text.toLowerCase()); this.offset = fullText.indexOf(text); @@ -296,6 +298,7 @@ private ACEEntityMention getMention(Node entityMention, ACEEntity aceEntity){ Span span = getSpan(extentCharseq); String aceText = extentCharseq.getTextContent(); Node head = ((Element)entityMention).getElementsByTagName("HEAD_EXTENT").item(0); + if (head == null) throw new RuntimeException("No head span?"); Node headCharseq = head == null ? null : ((Element)head).getElementsByTagName("CHARSEQ").item(0); Span headSpan = headCharseq == null ? null : getSpan(headCharseq); String aceHeadText = headCharseq == null ? "" : headCharseq.getTextContent(); diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java index cd6d14c..6719c97 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACEReader.java @@ -5,9 +5,12 @@ import static justhalf.nlp.reader.acereader.ACEDocument.unescape; +import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; @@ -25,7 +28,11 @@ import org.xml.sax.SAXException; import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.StringUtils; +import justhalf.nlp.depparser.DepParser; +import justhalf.nlp.depparser.StanfordDepParser; import justhalf.nlp.postagger.POSTagger; import justhalf.nlp.postagger.StanfordPOSTagger; import justhalf.nlp.reader.acereader.ACEEntity.ACEEntitySubType; @@ -36,6 +43,8 @@ import justhalf.nlp.reader.acereader.ACERelation.ACERelationType; import justhalf.nlp.reader.acereader.ACEValue.ACEValueSubType; import justhalf.nlp.reader.acereader.ACEValue.ACEValueType; +import justhalf.nlp.sentenceparser.SentenceParser; +import justhalf.nlp.sentenceparser.StanfordSentenceParser; import justhalf.nlp.sentencesplitter.SentenceSplitter; import justhalf.nlp.sentencesplitter.StanfordSentenceSplitter; import justhalf.nlp.tokenizer.RegexTokenizer; @@ -60,10 +69,13 @@ public class ACEReader { public static void main(String[] args) throws FileNotFoundException{ String ace2004DirName = null; String ace2005DirName = null; + String ace2004SplitFolder = null; + String ace2005SplitFolder = null; HashSet ace2004Domains = new LinkedHashSet(ACE2004_DOMAINS); HashSet ace2005Domains = new LinkedHashSet(ACE2005_DOMAINS); double[] datasplit = null; + int foldNum = 0; //use when split fold boolean print = false; boolean printEntities = false; boolean printRelations = false; @@ -72,14 +84,21 @@ public static void main(String[] args) throws FileNotFoundException{ boolean tokenize = false; boolean posTag = false; + boolean parse = false; + boolean dep = false; Tokenizer tokenizer = null; POSTagger posTagger = null; SentenceSplitter splitter = null; + SentenceParser parser = null; + DepParser depParser = null; + boolean toCoNLL = false; boolean ignoreOverlaps = false; boolean useBILOU = false; boolean splitByDocument = true; + boolean ace2004SplitFolds = false; + boolean ace2005SplitFolds = false; boolean shuffle = false; boolean excludeMetadata = false; int shuffleSeed = 31; @@ -181,6 +200,33 @@ public static void main(String[] args) throws FileNotFoundException{ } argIndex += 2; break; + case "-parser": + parse = true; + switch(args[argIndex+1]){ + case "stanford": + parser = new StanfordSentenceParser(); + break; + default: + System.out.println("Unrecognized sentence parser \""+args[argIndex+1]+"\", using stanford."); + parser = new StanfordSentenceParser(); + break; + } + argIndex += 2; + break; + case "-depparser": + dep = true; + switch(args[argIndex+1]){ + case "stanford": + //by default it's UD parser + depParser = new StanfordDepParser(); + break; + default: + System.out.println("Unrecognized dependency parser \""+args[argIndex+1]+"\", using stanford."); + depParser = new StanfordDepParser(); + break; + } + argIndex += 2; + break; case "-splitter": switch(args[argIndex+1]){ case "stanford": @@ -209,6 +255,24 @@ public static void main(String[] args) throws FileNotFoundException{ splitByDocument = false; argIndex += 1; break; + case "-ace2004SplitFolds": + ace2004SplitFolds = true; //note: careful about the split. + foldNum = Integer.valueOf(args[argIndex + 1]); + argIndex += 2; + break; + case "-ace2005SplitFolds": + ace2004SplitFolds = true; //note: careful about the split. + foldNum = Integer.valueOf(args[argIndex + 1]); + argIndex += 2; + break; + case "-ace2004SplitFolder": + ace2004SplitFolder = args[argIndex + 1]; + argIndex += 2; + break; + case "-ace2005SplitFolder": + ace2005SplitFolder = args[argIndex + 1]; + argIndex += 2; + break; case "-shuffle": shuffle = true; argIndex += 1; @@ -242,7 +306,7 @@ public static void main(String[] args) throws FileNotFoundException{ printHelp("Please specify the output directory for ACE2005."); System.exit(0); } - if(datasplit == null){ + if(datasplit == null && foldNum == 0){ printHelp("Please specify the datasplit with -dataSplit option."); System.exit(0); } @@ -305,7 +369,7 @@ public static void main(String[] args) throws FileNotFoundException{ Map eventTypeMentionCount = new HashMap(); for(File sgmFile: fileList){ try { - ACEDocument doc = new ACEDocument(sgmFile.getAbsolutePath(), excludeMetadata); + ACEDocument doc = new ACEDocument(sgmFile.getName(), sgmFile.getAbsolutePath(), excludeMetadata); docCount++; // printMentions(doc, doc.mentions); @@ -412,34 +476,57 @@ public static void main(String[] args) throws FileNotFoundException{ if(print){ if(ace2004Docs.size() > 0){ System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data"); - printDataset(ace2004OutputDir, ace2004Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, splitter, - toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); + printDataset(ace2004OutputDir, ace2004Docs, datasplit, foldNum, printEntities, printRelations, + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, + splitter,toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, ace2004SplitFolds, ace2004SplitFolder, shuffle, shuffleSeed); } if(ace2005Docs.size() > 0){ System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data"); - printDataset(ace2005OutputDir, ace2005Docs, datasplit, printEntities, printRelations, - (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, - splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed); + printDataset(ace2005OutputDir, ace2005Docs, datasplit, foldNum, printEntities, printRelations, + (tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, dep? depParser : null, parse ? parser : null, + splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, ace2005SplitFolds, ace2005SplitFolder, shuffle, shuffleSeed); } } } - private static void printDataset(String outputDir, List docs, double[] datasplit, - boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, - SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, - boolean splitByDocument, boolean shuffle, int shuffleSeed) throws FileNotFoundException { + private static void printDataset(String outputDir, List docs, double[] datasplit, int foldNum, + boolean printEntities, boolean printRelations, Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, + SentenceParser parser, SentenceSplitter splitter, boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, + boolean splitByDocument, boolean splitFolds, String splitFolder, boolean shuffle, int shuffleSeed) throws FileNotFoundException { List trainSentences = new ArrayList(); List devSentences = new ArrayList(); List testSentences = new ArrayList(); + List> foldSentences = new ArrayList<>(); if(splitByDocument){ - List trainDocs = new ArrayList(); - List devDocs = new ArrayList(); - List testDocs = new ArrayList(); - splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); - trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps); - devSentences = getSentences(devDocs, splitter, ignoreOverlaps); - testSentences = getSentences(testDocs, splitter, ignoreOverlaps); + if (splitFolds) { + List> foldDocs = new ArrayList<>(); + if (splitFolder == null) { + splitFolds(docs, foldDocs, foldNum, shuffle, shuffleSeed); + } else { + List> foldFileNames = readSplits(splitFolder, true); + splitFolds(docs, foldDocs, foldFileNames); + } + for (int i = 0; i < foldDocs.size(); i++) { + foldSentences.add(getSentences(foldDocs.get(i), splitter, ignoreOverlaps)); + } + } else { + List trainDocs = new ArrayList(); + List devDocs = new ArrayList(); + List testDocs = new ArrayList(); + if (splitFolder == null) { + splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed); + } else { + List> foldFileNames = readSplits(splitFolder, false); + List> foldDocs = new ArrayList<>(); + splitFolds(docs, foldDocs, foldFileNames); + trainDocs = foldDocs.get(0); + devDocs = foldDocs.get(1); + testDocs = foldDocs.get(2); + } + trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps); + devSentences = getSentences(devDocs, splitter, ignoreOverlaps); + testSentences = getSentences(testDocs, splitter, ignoreOverlaps); + } } else { List aceSentences = getSentences(docs, splitter, ignoreOverlaps); trainSentences = new ArrayList(); @@ -447,9 +534,17 @@ private static void printDataset(String outputDir, List docs, doubl testSentences = new ArrayList(); splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed); } - writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); - writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); - writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, printEntities, printRelations, toCoNLL, useBILOU); + + if (splitFolds) { + for (int i = 0; i < foldSentences.size(); i++) { + int fold = i + 1; + writeData(foldSentences.get(i), outputDir, "/fold."+fold+".data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + } + } else { + writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, depParser, parser, printEntities, printRelations, toCoNLL, useBILOU); + } } /** @@ -697,6 +792,45 @@ private static List fixTokens(List tokens){ return result; } + private static List> readSplits (String splitFile, boolean isFolds) { + List> splits = new ArrayList<>(); + try { + File folder = new File(splitFile); + File[] files = folder.listFiles(); + if (!isFolds) { + for(File file : files) { + if (!file.isDirectory()) splits.add(new ArrayList<>()); + } + } + for(File file : files) { + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file.getAbsolutePath()),"UTF-8")); + String line = null; + List split = new ArrayList<>(); + while ((line = br.readLine()) != null) { + split.add(line); + } + if (!isFolds) { + if (file.getName().equals("train")) { + splits.set(0, split); + } else if (file.getName().equals("dev")) { + splits.set(1, split); + } else if (file.getName().equals("test")) { + splits.set(2, split); + } else { + br.close(); + throw new RuntimeException("unknow:" + file.getName()); + } + } else { + splits.add(split); + } + br.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + return splits; + } + private static void splitData(List aceObjects, List trainObjects, List devObjects, List testObjects, double[] datasplit, boolean shuffle, int shuffleSeed){ int total = aceObjects.size(); @@ -722,6 +856,54 @@ private static void splitData(List aceObjects, List trainObjects, List System.out.println("Test: "+testObjects.size()); } + private static void splitFolds(List aceObjects, List> foldObjects, int foldNum, boolean shuffle, int shuffleSeed){ + int total = aceObjects.size(); + int foldSize = total / foldNum; //last fold may contain more + List tmpObjects = new ArrayList(); + tmpObjects.addAll(aceObjects); + if(shuffle){ + Collections.shuffle(tmpObjects, new Random(shuffleSeed)); + } + for (int f = 0; f < foldNum; f++) { + List foldList = new ArrayList(); + int end = foldSize * (f + 1); + if (f == foldNum - 1 && end < total) end = total; + foldList.addAll(tmpObjects.subList(foldSize * f, end)); + foldObjects.add(foldList); + } + String typeName = tmpObjects.get(0).getClass().getName(); + typeName = typeName.substring(typeName.lastIndexOf(".")+1); + System.out.println("Number of objects ("+typeName+"):"); + System.out.println("Number of folds: "+foldNum); + System.out.println("Fold size: "+foldSize); + System.out.println("Last Fold Size: "+(total - foldSize * foldNum + foldSize)); + } + + private static void splitFolds(List aceObjects, List> foldObjects, List> foldFileNames){ + Map name2Obj = new HashMap<>(); + for(ACEDocument doc : aceObjects) { + String trimedFileName = doc.fileName.replace(".sgm", ""); + name2Obj.put(trimedFileName, doc); + } + if (name2Obj.size() != aceObjects.size()) throw new RuntimeException("not same size?"); + + for (int f = 0; f < foldFileNames.size(); f++) { + List foldList = new ArrayList(); + List fileNames = foldFileNames.get(f); + for(String fileName : fileNames) { + ACEDocument doc = name2Obj.get(fileName); + if (doc==null) throw new RuntimeException("null doc?"); + foldList.add(doc); + } + foldObjects.add(foldList); + System.out.println("Fold "+(f+1)+" size: "+fileNames.size()); + } + String typeName = aceObjects.get(0).getClass().getName(); + typeName = typeName.substring(typeName.lastIndexOf(".")+1); + System.out.println("Number of objects ("+typeName+"):"); + System.out.println("Number of folds: "+foldFileNames.size()); + } + private static void printStatistics(List sentences){ Map entityCounts = new HashMap(); for(ACESentence sentence: sentences){ @@ -759,23 +941,46 @@ private static > List sorted(Collection coll){ } private static void writeData(List sentences, String outputDir, String name, - Tokenizer tokenizer, POSTagger posTagger, boolean printEntities, boolean printRelations, + Tokenizer tokenizer, POSTagger posTagger, DepParser depParser, SentenceParser parser, boolean printEntities, boolean printRelations, boolean toCoNLL, boolean useBILOU) throws FileNotFoundException{ PrintWriter printer = new PrintWriter(new File(outputDir+name)); for(ACESentence sentence: sentences){ if(tokenizer != null){ List tokens = fixTokens(tokenizer.tokenize(sentence.text)); + for (CoreLabel token : tokens) { + token.setValue(escapeBracket(token.value())); + } if(posTagger != null){ posTagger.tagCoreLabels(tokens); } + int[] heads = null; + String[] depLabels = null; + if (depParser != null && depParser instanceof StanfordDepParser) { + heads = new int[tokens.size()]; + depLabels = new String[tokens.size()]; + List deps = depParser.parse(tokens); + for (int i = 0; i < deps.size(); i++) { + TypedDependency typedDep = deps.get(i); + heads[typedDep.dep().index() - 1] = typedDep.gov().index() - 1; + depLabels[typedDep.dep().index() - 1] = typedDep.reln().getShortName(); + } + } + Tree parseTree = null; + if (parser != null) { + parseTree = parser.parseCoreLabel(tokens); + } if(toCoNLL){ - List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); +// List outputTokens = spansToLabels(sentence.entities, tokens, useBILOU); //use full span + List outputTokens = headSpansToLabels(sentence.entities, tokens, useBILOU); //use head span. if(posTagger != null){ for(int i=0; i sentences, String outputDir, Str printer.println(String.format("%s\t%s",tokens.get(i).value(), annotations)); } } + if(printRelations){ + StringBuilder stringBuilder = new StringBuilder(); + for(ACERelationMention relation: sentence.relations){ + if(stringBuilder.length() > 0){ + stringBuilder.append("|"); + } +// Span relSpan = findWordSpan(relation.span, tokens); + stringBuilder.append(relation.relation.type()); + //stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); + for(ACEEntityMention mention: relation.args){ +// Span span = findWordSpan(mention.span, tokens); + Span headSpan = findWordSpan(mention.headSpan, tokens); + stringBuilder.append(String.format(" %s,%s %s", headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); + } + } + printer.println(stringBuilder.toString()); + } printer.println(); } else { StringBuilder stringBuilder = new StringBuilder(); @@ -794,8 +1017,7 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append(" "); } - stringBuilder.append(token.value()); - token.setWord(escapeBracket(token.word())); + stringBuilder.append(token.word()); } printer.println(stringBuilder.toString()); if(posTagger != null){ @@ -808,6 +1030,30 @@ private static void writeData(List sentences, String outputDir, Str } printer.println(stringBuilder.toString()); } + if (posTagger != null && parser != null) { + //Note: the line above parser string must contain something. otherwise the penn reader later have error. + stringBuilder = new StringBuilder(); + stringBuilder = parseTree.toStringBuilder(stringBuilder); + printer.println(stringBuilder.toString()); + } + if (heads != null && depLabels !=null) { + stringBuilder = new StringBuilder(); + for (int i = 0; i < heads.length; i++) { + if(stringBuilder.length() > 0){ + stringBuilder.append(" "); + } + stringBuilder.append(heads[i]+""); + } + printer.println(stringBuilder.toString()); + stringBuilder = new StringBuilder(); + for (int i = 0; i < depLabels.length; i++) { + if(stringBuilder.length() > 0){ + stringBuilder.append(" "); + } + stringBuilder.append(depLabels[i]); + } + printer.println(stringBuilder.toString()); + } if(printEntities){ stringBuilder = new StringBuilder(); for(ACEEntityMention mention: sentence.entities){ @@ -817,6 +1063,7 @@ private static void writeData(List sentences, String outputDir, Str stringBuilder.append("|"); } stringBuilder.append(String.format("%s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format("%s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } printer.println(stringBuilder.toString()); } @@ -827,16 +1074,18 @@ private static void writeData(List sentences, String outputDir, Str if(stringBuilder.length() > 0){ stringBuilder.append("|"); } - stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); + Span relSpan = findWordSpan(relation.span, tokens); + stringBuilder.append(relation.relation.type() + "::" + relSpan.start + ","+relSpan.end); + //stringBuilder.append(relation.relation.type() + "::" + relation.relation.subtype()); for(ACEEntityMention mention: relation.args){ Span span = findWordSpan(mention.span, tokens); Span headSpan = findWordSpan(mention.headSpan, tokens); stringBuilder.append(String.format(" %s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form)); + //stringBuilder.append(String.format(" %s,%s,%s,%s %s,%s,%s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form, mention.mentionType.name(), mention.entity.subtype())); } } printer.println(stringBuilder.toString()); } - printer.println(); } } else { @@ -918,6 +1167,44 @@ private static List spansToLabels(List mentions, Li return Arrays.asList(result); } + private static List headSpansToLabels(List mentions, List tokens, boolean useBILOU){ + WordLabel[] result = new WordLabel[tokens.size()]; + Arrays.fill(result, null); + for(ACEEntityMention mention: mentions){ + Span span = findWordSpan(mention.headSpan, tokens); + String type = mention.label.form; + for(int i=span.start; i readDocuments(String ace2004DirName, String ace2 extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm"); } for(File sgmFile: fileList){ - result.add(new ACEDocument(sgmFile.getAbsolutePath())); + result.add(new ACEDocument(sgmFile.getName(), sgmFile.getAbsolutePath())); } return result; } diff --git a/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java b/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java index 846ef44..6664bda 100644 --- a/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java +++ b/src/main/java/justhalf/nlp/reader/acereader/ACERelation.java @@ -20,8 +20,8 @@ public static enum ACERelationType implements ACEObjectType { // Only in ACE2004 EMP_ORG("Employee/Membership/Subsidiary", true, false), - OTHER_AFF("PER/ORG Affiliation", true, false), - GPE_AFF("GPE Affiliation", true, false), + OTHER_AFF("PER/ORG-Affiliation", true, false), + GPE_AFF("GPE-Affiliation", true, false), DISC("Discourse", true, false), // Only in ACE2005 diff --git a/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java b/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java index 1288784..964a976 100644 --- a/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java +++ b/src/main/java/justhalf/nlp/sentenceparser/SentenceParser.java @@ -2,6 +2,7 @@ import java.util.List; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.Tree; import justhalf.nlp.NLPInterface; @@ -27,4 +28,13 @@ public interface SentenceParser extends NLPInterface{ * The parsed sentence */ public Tree parse(List sentence); + + /** + * Parse pretagged sentence, returning a Tree. + * @param sentence + * The sentence to be parsed as a list of tokens and also labels + * @return the parsed tree + */ + public Tree parseCoreLabel(List sentence); + } diff --git a/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java b/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java index fe22f29..cd784ab 100644 --- a/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java +++ b/src/main/java/justhalf/nlp/sentenceparser/StanfordSentenceParser.java @@ -2,6 +2,7 @@ import java.util.List; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.trees.Tree; @@ -29,6 +30,11 @@ public Tree parse(List sentence) { return parser.parseStrings(sentence); } + @Override + public Tree parseCoreLabel(List sentence) { + return parser.parse(sentence); + } + @Override public boolean isThreadSafe(){ return true;