diff --git a/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java b/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java index ecd6005a4..9334be6c5 100644 --- a/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java +++ b/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java @@ -8,6 +8,7 @@ import org.clulab.utils.JavaUtils; import java.util.Iterator; +import scala.collection.Seq; public class ProcessorsJavaExample { public static void main(String [] args) throws Exception { @@ -20,25 +21,25 @@ public static void main(String [] args) throws Exception { // You are basically done. The rest of this code simply prints out the annotations. // Let's print the sentence-level annotations. - for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length; sentenceIndex++) { - Sentence sentence = doc.sentences()[sentenceIndex]; + for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length(); sentenceIndex++) { + Sentence sentence = doc.sentences().apply(sentenceIndex); System.out.println("Sentence #" + sentenceIndex + ":"); - System.out.println("Tokens: " + mkString(sentence.words())); - System.out.println("Start character offsets: " + mkString(sentence.startOffsets())); - System.out.println("End character offsets: " + mkString(sentence.endOffsets())); + System.out.println("Tokens: " + mkStringStr(sentence.words())); + System.out.println("Start character offsets: " + mkStringInt(sentence.startOffsets())); + System.out.println("End character offsets: " + mkStringInt(sentence.endOffsets())); // These annotations are optional, so they are stored using Option objects, // hence the isDefined() and get() calls. if (sentence.lemmas().isDefined()) - System.out.println("Lemmas: " + mkString(sentence.lemmas().get())); + System.out.println("Lemmas: " + mkStringStr(sentence.lemmas().get())); if (sentence.tags().isDefined()) - System.out.println("POS tags: " + mkString(sentence.tags().get())); + System.out.println("POS tags: " + mkStringStr(sentence.tags().get())); if (sentence.chunks().isDefined()) - System.out.println("Chunks: " + mkString(sentence.chunks().get())); + System.out.println("Chunks: " + mkStringStr(sentence.chunks().get())); if (sentence.entities().isDefined()) - System.out.println("Named entities: " + mkString(sentence.entities().get())); + System.out.println("Named entities: " + mkStringStr(sentence.entities().get())); if (sentence.norms().isDefined()) - System.out.println("Normalized entities: " + mkString(sentence.norms().get())); + System.out.println("Normalized entities: " + mkStringStr(sentence.norms().get())); if (sentence.dependencies().isDefined()) { System.out.println("Syntactic dependencies:"); Iterator> iterator = @@ -53,27 +54,27 @@ public static void main(String [] args) throws Exception { } } - public static String mkString(String[] strings, String sep) { + public static String mkStringStr(Seq strings, String sep) { StringBuilder stringBuilder = new StringBuilder(); - for (int i = 0; i < strings.length; i ++) { + for (int i = 0; i < strings.length(); i ++) { if (i > 0) stringBuilder.append(sep); - stringBuilder.append(strings[i]); + stringBuilder.append(strings.apply(i)); } return stringBuilder.toString(); } - public static String mkString(String[] strings) { return mkString(strings, " "); } + public static String mkStringStr(Seq strings) { return mkStringStr(strings, " "); } - public static String mkString(int[] ints, String sep) { + public static String mkStringInt(Seq ints, String sep) { 
StringBuilder stringBuilder = new StringBuilder(); - for (int i = 0; i < ints.length; i ++) { + for (int i = 0; i < ints.length(); i ++) { if (i > 0) stringBuilder.append(sep); - stringBuilder.append(ints[i]); + stringBuilder.append(ints.apply(i)); } return stringBuilder.toString(); } - public static String mkString(int[] ints) { return mkString(ints, " "); } + public static String mkStringInt(Seq ints) { return mkStringInt(ints, " "); } public static Iterable iteratorToIterable(Iterator iterator) { return () -> iterator; } } diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala index 2789eb0d1..506486e88 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala @@ -2,6 +2,8 @@ package org.clulab.processors.apps import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.processors.clu.BalaurProcessor +import org.clulab.scala.WrappedArrayBuffer._ +import org.clulab.utils.WrappedArraySeq import org.slf4j.{Logger, LoggerFactory} import java.io.InputStream @@ -17,101 +19,106 @@ class ColumnsToDocument * Last Modified: Fix compiler issue: import scala.io.Source. */ object ColumnsToDocument { - val logger:Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument]) + type LabelSetter = (Sentence, Seq[String]) => Sentence + type Annotator = (Document) => Document + val logger: Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument]) val WORD_POS_CONLLX = 1 val TAG_POS_CONLLX = 4 val WORD_POS_CONLLU = 1 val TAG_POS_CONLLU = 3 - var proc:Processor = new BalaurProcessor() + var proc: Processor = new BalaurProcessor() var prevLang: String = "en" - def readFromFile(fn:String, - wordPos:Int = WORD_POS_CONLLX, - labelPos:Int = TAG_POS_CONLLX, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean = false, - lang: String = "en" - ): Document = { - - // redefine proc acording to the language used + protected def setProcessor(lang: String): Unit = { if (lang != prevLang) { if (lang == "pt") { println("Using Portuguese processors") throw new RuntimeException(s"ERROR: language '$lang' not supported!") //this.proc = new PortugueseCluProcessor() - } else if (lang == "es") { + } + else if (lang == "es") { println("Using Spanish processors") //this.proc = new SpanishCluProcessor() throw new RuntimeException(s"ERROR: language '$lang' not supported!") - } else { + } + else { println("Using English processors") this.proc = new BalaurProcessor() } this.prevLang = lang } + } + def readFromFile( + fn: String, + wordPos: Int = WORD_POS_CONLLX, + labelPos: Int = TAG_POS_CONLLX, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions: Boolean = false, + lang: String = "en" + ): Document = { + setProcessor(lang) Using.resource(Source.fromFile(fn)) { source => readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) } } - def readFromStream(stream:InputStream, - wordPos:Int = WORD_POS_CONLLX, - labelPos:Int = TAG_POS_CONLLX, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean = false, - lang: String = "en"): Document = { - - // redefine proc acording to the language used - if (lang == "pt"){ - println("Using Portuguese processors") - //this.proc = new PortugueseCluProcessor() - throw new RuntimeException(s"ERROR: language '$lang' not 
supported!") - } else if(lang == "es") { - println("Using Spanish processors") - //this.proc = new SpanishCluProcessor() - throw new RuntimeException(s"ERROR: language '$lang' not supported!") - } else { - println("Using English processors") - this.proc = new BalaurProcessor() - } - + def readFromStream( + stream: InputStream, + wordPos: Int = WORD_POS_CONLLX, + labelPos: Int = TAG_POS_CONLLX, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions: Boolean = false, + lang: String = "en" + ): Document = { + setProcessor(lang) Using.resource(Source.fromInputStream(stream)) { source => readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) } } - def readFromSource(source:Source, - wordPos:Int, - labelPos:Int, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean): Document = { - var words = new ArrayBuffer[String]() - var startOffsets = new ArrayBuffer[Int]() - var endOffsets = new ArrayBuffer[Int]() - var labels = new ArrayBuffer[String]() - var charOffset = 0 + def readFromSource( + source: Source, + wordPos: Int, + labelPos: Int, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions:Boolean + ): Document = { + val words = new ArrayBuffer[String]() + val startOffsets = new ArrayBuffer[Int]() + val endOffsets = new ArrayBuffer[Int]() + val labels = new ArrayBuffer[String]() val sentences = new ArrayBuffer[Sentence]() - for(line <- source.getLines()) { - val l = line.trim + var charOffset = 0 + + def mkSentence(): Sentence = { + val wordsSeq = new WrappedArraySeq(words.toArray).toImmutableSeq + val unlabeledSentence = new Sentence(wordsSeq, startOffsets, endOffsets, wordsSeq) + + words.clear() + startOffsets.clear() + endOffsets.clear() + + val labeledSentence = setLabels(unlabeledSentence, labels.toSeq) + + labels.clear() + labeledSentence + } + + source.getLines().map(_.trim).foreach { l => if (l.isEmpty) { // end of sentence if (words.nonEmpty) { - val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray) - setLabels(s, labels.toArray) - sentences += s - words = new ArrayBuffer[String]() - startOffsets = new ArrayBuffer[Int]() - endOffsets = new ArrayBuffer[Int]() - labels = new ArrayBuffer[String]() + sentences += mkSentence() charOffset += 1 } - } else { + } + else { // within the same sentence val bits = l.split("\\s+") if (bits.length < 2) @@ -125,52 +132,28 @@ object ColumnsToDocument { // 10 as o DET _ Gender=Fem|Number=Plur 11 det _ _ // val offset = bits(0) // we assume token offsets are always in column 0! - if(! filterOutContractions || ! offset.contains("-")) { + if (!filterOutContractions || ! 
offset.contains("-")) { words += bits(wordPos) labels += bits(labelPos) startOffsets += charOffset charOffset = bits(wordPos).length endOffsets += charOffset charOffset += 1 - } else { + } + else { // println("Skipped line: " + l) } } } - if(words.nonEmpty) { - val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray) - s.tags = Some(labels.toArray) - sentences += s - } + if (words.nonEmpty) + sentences += mkSentence() logger.debug(s"Loaded ${sentences.size} sentences.") - val d = new Document(sentences.toArray) - annotate(d) - - d - - } - - def setTags(s:Sentence, tags:Array[String]): Unit = { - s.tags = Some(tags) - } - - def setChunks(s:Sentence, chunks:Array[String]): Unit = { - s.chunks = Some(chunks) - } - - def setEntities(s:Sentence, entities:Array[String]): Unit = { - s.entities = Some(entities) - } - - def annotateLemmas(doc:Document): Unit = { - proc.lemmatize(doc) // some features use lemmas, which are not available in the CoNLL data - } + val unannotatedSentence = new Document(sentences) + val annotatedSentence = annotate(unannotatedSentence) - def annotateLemmmaTags(doc:Document): Unit = { - proc.lemmatize(doc) - proc.tagPartsOfSpeech(doc) + annotatedSentence } - def annotateNil(doc:Document): Unit = {} + def annotateNil(document: Document): Document = document } diff --git a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala index 022a59cc0..c0303f6ca 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala @@ -3,7 +3,7 @@ package org.clulab.processors.apps import org.clulab.processors.Document import org.clulab.processors.clu.BalaurProcessor import org.clulab.serialization.CoNLLUSerializer -import org.clulab.utils.{FileUtils, StringUtils} +import org.clulab.utils.{FileUtils, StringUtils, WrappedArraySeq} import java.io.PrintWriter import scala.util.Using @@ -36,7 +36,11 @@ object CommandLineInterface extends App { } else if(props.containsKey(TOKENS)) { // one sentence per line; sentences are tokenized val sents = FileUtils.getLinesFromFile(props.getProperty(INPUT)) - val tokenizedSents = sents.map(_.split("\\s+").toIterable) + val tokenizedSents = sents.map { sent => + val tokens = sent.split("\\s+") + + WrappedArraySeq(tokens).toImmutableSeq + } proc.annotateFromTokens(tokenizedSents) } else { // assume raw text diff --git a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala b/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala deleted file mode 100644 index fdba5a609..000000000 --- a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala +++ /dev/null @@ -1,10 +0,0 @@ -package org.clulab.processors.apps - -import org.clulab.numeric.EvalTimeNorm -import org.clulab.processors.clu.BalaurProcessor - -object EvalTimeNormApp extends App { - val proc = new BalaurProcessor() - - EvalTimeNorm.run(proc) -} diff --git a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala index f320b6f4b..6220f625e 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala @@ -2,23 +2,21 @@ package org.clulab.processors.apps import 
org.clulab.processors.Document import org.clulab.processors.Processor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.serialization.DocumentSerializer import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} -import java.io.BufferedOutputStream import java.io.File -import java.io.FileOutputStream import java.io.PrintWriter +import scala.collection.compat._ import scala.collection.parallel.ParSeq import scala.util.Using -import org.clulab.processors.clu.BalaurProcessor object InfiniteParallelProcessorsExample { class ProcessorProvider(reuseProcessor: Boolean) { protected val processorOpt: Option[Processor] = - if (reuseProcessor) Some(new BalaurProcessor()) - else None + Option.when(reuseProcessor)(new BalaurProcessor()) def newOrReusedProcessor: Processor = if (reuseProcessor) processorOpt.get @@ -37,9 +35,6 @@ object InfiniteParallelProcessorsExample { val documentSerializer = new DocumentSerializer def processFiles(parFiles: ParSeq[File], processor: Processor): Unit = { - - def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter) - parFiles.foreach { file => println(s"Processing ${file.getName}...") @@ -47,7 +42,7 @@ object InfiniteParallelProcessorsExample { val outputFile = new File(outputDir + "/" + file.getName) val document = processor.annotate(text) val printedDocument = StringUtils.viaPrintWriter { printWriter => - printDocument(document, printWriter) + new DocumentPrettyPrinter(printWriter).print(document) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument diff --git a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala index 0009b0f04..d4ddd3cb4 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala @@ -1,6 +1,6 @@ package org.clulab.processors.apps -import org.clulab.numeric.{displayMentions, setLabelsAndNorms} +import org.clulab.numeric.NumericUtils import org.clulab.processors.clu.BalaurProcessor import org.clulab.utils.ReloadableProcessor import org.clulab.utils.ReloadableShell @@ -23,9 +23,8 @@ class ReloadableNumericProcessor(ruleDirOpt: Option[String]) extends ReloadableP val numericEntityRecognizerOpt = balaurProcessor .numericEntityRecognizerOpt .map(_.reloaded(new File(ruleDirOpt.get))) - val numericEntityRecognizerOptOpt = numericEntityRecognizerOpt.map(Option(_)) - processorOpt = Some(balaurProcessor.copy(numericEntityRecognizerOptOpt = numericEntityRecognizerOptOpt)) + processorOpt = Some(balaurProcessor.copy(numericEntityRecognizerOpt = numericEntityRecognizerOpt)) } } @@ -35,10 +34,12 @@ class NumericEntityRecognizerShell(ruleDirOpt: Option[String]) extends Reloadabl /** The actual work, including printing out the output */ def work(text: String): Unit = { val doc = proc.get.annotate(text) + // This gets the same numericEntityRecognizer already used in the annotation + // so that the mentions, since thrown away, can be recalculated. val mentions = proc.get.numericEntityRecognizerOpt.map(_.extractFrom(doc)).getOrElse(Seq.empty) - setLabelsAndNorms(doc, mentions) - displayMentions(mentions, doc) + // The doc should already have been annotated two lines above. 
+ NumericUtils.displayMentions(mentions, doc) } def reload(): Unit = { diff --git a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala index 09440b813..106f7d09b 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala @@ -6,6 +6,7 @@ import org.clulab.sequences.LexiconNER import org.clulab.utils.FileUtils import java.io.File +import scala.collection.compat._ object OdinStarter extends App { // When using an IDE rather than sbt, make sure the working directory for the run @@ -20,11 +21,11 @@ object OdinStarter extends App { val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) val isLocal = kbs.forall(new File(resourceDir, _).exists) - val baseDirOpt = if (isLocal) Some(resourceDir) else None + val baseDirOpt = Option.when(isLocal)(resourceDir) LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val masterResource = "/org/clulab/odinstarter/main.yml" // We usually want to reload rules during development, diff --git a/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala index 6e514c3e1..bc28048e5 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala @@ -2,7 +2,7 @@ package org.clulab.processors.apps import org.clulab.processors.Document import org.clulab.processors.Processor -import org.clulab.processors.clu.BalaurProcessor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.serialization.DocumentSerializer import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} @@ -13,9 +13,6 @@ import scala.util.Using object ParallelProcessorsExample { def mainWithCallback(args: Array[String])(callback: (File, String) => Unit): Unit = { - - def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter) - val inputDir = args(0) val outputDir = args(1) val extension = args(2) @@ -56,7 +53,7 @@ object ParallelProcessorsExample { throw throwable } val printedDocument = StringUtils.viaPrintWriter { printWriter => - printDocument(document, printWriter) + new DocumentPrettyPrinter(printWriter).print(document) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala index 97a990764..b92b75129 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala @@ -30,7 +30,7 @@ object ProcessCoNLL03 extends App { } } - def saveSent(pw: PrintWriter, sent: Array[Row], tags: Option[Array[String]] = None, chunks: Option[Array[String]] = None): Unit = { + def saveSent(pw: PrintWriter, sent: Array[Row], tags: Option[Seq[String]] = None, chunks: Option[Seq[String]] = None): Unit = { if (tags.isDefined) { assert(sent.length == tags.get.length) //println("Using generated POS tags") diff --git 
a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala index 8bc6aa608..518e0f667 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala @@ -1,5 +1,6 @@ package org.clulab.processors.apps +import org.clulab.processors.clu.DocumentPrettyPrinter import org.clulab.processors.{Document, Processor} import org.clulab.serialization.DocumentSerializer @@ -13,6 +14,7 @@ import java.io.PrintWriter */ object ProcessorsDocSerializerExample { def main(args:Array[String]): Unit = { + val documentPrinter = new DocumentPrettyPrinter(new PrintWriter(System.out)) // create the processor val proc = Processor() @@ -20,14 +22,11 @@ object ProcessorsDocSerializerExample { val doc = proc.annotate("John Smith went to China. He visited Beijing, on January 10th, 2013.") // you are basically done. the rest of this code simply prints out the annotations - printDoc(doc) + documentPrinter.print(doc) // serialize the doc using our custom serializer val ser = new DocumentSerializer val out = ser.save(doc) println("SERIALIZED DOC:\n" + out) } - - def printDoc(doc:Document): Unit = { doc.prettyPrint(new PrintWriter(System.out)) } - } diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala index 8f8dc65e1..fb203652f 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala @@ -34,5 +34,5 @@ object ProcessorsScalaExample extends App { println() } - def mkString[T](elems: Array[T]): String = elems.mkString(" ") + def mkString[T](elems: Seq[T]): String = elems.mkString(" ") } diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala index 012949e4a..903cf6113 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala @@ -1,7 +1,7 @@ package org.clulab.processors.apps import org.clulab.processors.Processor -import org.clulab.processors.clu.BalaurProcessor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.utils.CliReader import org.clulab.utils.ExitMenuItem import org.clulab.utils.HelpMenuItem @@ -27,6 +27,7 @@ class ProcessorsShell extends Shell { val lineReader = new CliReader(proc.prompt, "user.home", ".processorshellhistory") val printWriter = new PrintWriter(System.out) + val documentPrinter = new DocumentPrettyPrinter(printWriter) def prepareProcessor(message: String, promptedReloadableProcessor: PromptedReloadableProcessor): Unit = { lineReader.setPrompt(promptedReloadableProcessor.prompt) @@ -40,8 +41,8 @@ class ProcessorsShell extends Shell { override def work(text: String): Unit = { val doc = proc.get.annotate(text) - doc.prettyPrint(printWriter) - printWriter.flush() + + documentPrinter.print(doc) } // We inherit now just from Shell, so no reloading is performed. diff --git a/build.sbt b/build.sbt index 354fc7974..69a37d8d2 100644 --- a/build.sbt +++ b/build.sbt @@ -1,23 +1,24 @@ -// These were last checked on 2025-02-19. +// These were last checked on 2025-05-09. 
val scala211 = "2.11.12" // up to 2.11.12 -val scala212 = "2.12.19" // up to 2.12.20 -val scala213 = "2.13.14" // up to 2.13.16 +val scala212 = "2.12.20" // up to 2.12.20 +val scala213 = "2.13.16" // up to 2.13.16 val scala30 = "3.0.2" // up to 3.0.2 val scala31 = "3.1.3" // up to 3.1.3 val scala32 = "3.2.2" // up to 3.2.2 -val scala33 = "3.3.5" // up to 3.3.5 (LTS) +val scala33 = "3.3.6" // up to 3.3.6 (LTS) val scala34 = "3.4.3" // up to 3.4.3 val scala35 = "3.5.2" // up to 3.5.2 -val scala36 = "3.6.3" // up to 3.6.3 +val scala36 = "3.6.4" // up to 3.6.4 +val scala37 = "3.7.0" // up to 3.7.0 // See https://www.scala-lang.org/blog/2022/08/17/long-term-compatibility-plans.html. // Scala30: "If you are maintaining a library, you should drop Scala 3.0." Dropped. // Scala31: This is a LTS (long term support) version before it was called that. // Scala32: This is for experimentation, as in Scala Next, and not for release. // Scala33: This is the first official LTS, but hold off until necessary. -val scala3 = scala31 +val scala3 = scala33 -ThisBuild / crossScalaVersions := Seq(scala212, scala211, scala213, scala3) +ThisBuild / crossScalaVersions := Seq(scala212, scala3, scala213, scala211) ThisBuild / scalaVersion := crossScalaVersions.value.head lazy val root = (project in file(".")) @@ -46,6 +47,6 @@ lazy val webapp = project ) lazy val debugger = project - .dependsOn(library % "compile -> compile; test -> test") + .dependsOn(library % "compile -> compile; test -> test") addCommandAlias("dockerizeWebapp", ";webapp/docker:publishLocal") diff --git a/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala b/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala index 6924eef03..f55ff1b9b 100644 --- a/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala +++ b/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala @@ -31,7 +31,7 @@ object DebuggingOdinStarterApp extends App { LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) } - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val exampleGlobalAction = (inMentions: Seq[Mention], state: State) => { val outMentions = inMentions.map { mention => if (mention.words.length % 2 == 0) diff --git a/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala b/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala index 4a8866a2e..ff7a632aa 100644 --- a/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala +++ b/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala @@ -18,8 +18,8 @@ class HtmlSentenceVisualizer extends SentenceVisualizer with HtmlVisualizing { string } - def getOrEmpty(arrayOpt: Option[Array[String]], index: Int): String = - arrayOpt.map(_(index)).getOrElse("") + def getOrEmpty(seqOpt: Option[Seq[String]], index: Int): String = + seqOpt.map(_(index)).getOrElse("") val rows = sentence.words.indices.map { i => tr( diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala index 7534c6415..b88f678b1 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala 
+++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugRelationGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala index 860646c06..97f8c4631 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala +++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugTriggerMentionGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala index 7aa28848b..31447ac66 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala +++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugTriggerPatternGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/library/build.sbt b/library/build.sbt index a562a04b5..9b2e770d0 100644 --- a/library/build.sbt +++ b/library/build.sbt @@ -54,7 +54,7 @@ libraryDependencies ++= { "org.scalatest" %% "scalatest" % "3.2.15" % Test, // up to 3.2.19, Apache-2.0 // for odin "org.apache.commons" % "commons-text" % "1.1", // up to 1.12.0, Apache-2.0 - "org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0", // up to 2.12.0 // Apache-2.0 + "org.scala-lang.modules" %% "scala-collection-compat" % "2.13.0", // up to 2.13.0 // Apache-2.0 "org.scala-lang.modules" %% "scala-parser-combinators" % combinatorsVersion, // Apache-2.0 "org.yaml" % "snakeyaml" % "1.14", // up to 2.2, Apache-2.0 // progress bar for training diff --git a/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..649887166 --- /dev/null +++ 
b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Type[T] = scala.collection.SeqView[T, Seq[T]] +} diff --git a/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala b/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala deleted file mode 100644 index a6a43654c..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{Stream => ImmutableStream} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableStream[T] - val LazyList = ImmutableStream -} diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index d9b2cbfc5..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,12 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -class DependencyMap protected extends mutable.HashMap[Int, DirectedGraph[String]] { - override def initialSize: Int = 2 // we have very few dependency types, so let's create a small hash to save memory -} - -object DependencyMap extends DependencyMapNames { - - def apply(): DependencyMap = new DependencyMap() -} diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala deleted file mode 100644 index 57ad2411e..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,17 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -class GraphMap protected extends mutable.HashMap[String, DirectedGraph[String]] { - override def initialSize: Int = 2 // we have very few dependency types, so let's create a small hash to save memory -} - -object GraphMap extends GraphMapNames { - - def apply(): GraphMap = new GraphMap() - - def apply(existing: Map[String, DirectedGraph[String]]): GraphMap = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..e227e7cbb --- /dev/null +++ b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Type[T] = scala.collection.View[T] +} diff --git a/library/src/main/scala-2.13/org/clulab/scala/package.scala b/library/src/main/scala-2.13/org/clulab/scala/package.scala deleted file mode 100644 index 8df18bbdf..000000000 --- a/library/src/main/scala-2.13/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.collection.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{LazyList => ImmutableLazyList} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableLazyList[T] - val LazyList = ImmutableLazyList -} diff --git a/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index c4ed49b82..000000000 --- a/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,14 
+0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object DependencyMap extends DependencyMapNames { - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value. - type DependencyMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): DependencyMap = { - // we have very few dependency types, so let's create a small hash to save memory. - new DependencyMap(2, mutable.HashMap.defaultLoadFactor) - } -} diff --git a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala deleted file mode 100644 index fd1b32794..000000000 --- a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,20 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object GraphMap extends GraphMapNames { - - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value - type GraphMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): GraphMap = { - // we have very few dependency types, so let's create a small hash to save memory. - new GraphMap(2, mutable.HashMap.defaultLoadFactor) - } - - def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMap = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala b/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala deleted file mode 100644 index fa9dfa73d..000000000 --- a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala +++ /dev/null @@ -1,67 +0,0 @@ -package org.clulab.odinstarter - -import org.clulab.odin.ExtractorEngine -import org.clulab.odin.Mention -import org.clulab.processors.clu.BalaurProcessor -import org.clulab.sequences.LexiconNER -import org.clulab.utils.FileUtils - -import java.io.File - -object OdinStarter3: - - // From sbt use "runMain org.clulab.odinstarter.main". - @main def main() = - // When using an IDE rather than sbt, make sure the working directory for the run - // configuration is the subproject directory so that this resourceDir is accessible. - val resourceDir: File = new File("./src/main/resources") - val customLexiconNer = // i.e., Named Entity Recognizer - val kbsAndCaseInsensitiveMatchings: Seq[(String, Boolean)] = Seq( - // You can add additional kbs (knowledge bases) and caseInsensitiveMatchings here. - ("org/clulab/odinstarter/FOOD.tsv", true) // , - // ("org/clulab/odinstarter/RESTAURANTS.tsv", false) - ) - val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) - val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) - val isLocal = kbs.forall(new File(resourceDir, _).exists) - val baseDirOpt = if isLocal then Some(resourceDir) else None - - LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) - val extractorEngine = - val masterResource = "/org/clulab/odinstarter/main.yml" - // We usually want to reload rules during development, - // so we try to load them from the filesystem first, then jar. - // The resource must start with /, but the file probably shouldn't. 
- val masterFile = new File(resourceDir, masterResource.drop(1)) - - if masterFile.exists then - // Read rules from file in filesystem. - val rules = FileUtils.getTextFromFile(masterFile) - ExtractorEngine(rules, ruleDir = Some(resourceDir)) - else - // Read rules from resource in jar. - val rules = FileUtils.getTextFromResource(masterResource) - ExtractorEngine(rules, ruleDir = None) - val document = processor.annotate("John eats cake.") - val mentions = extractorEngine.extractFrom(document).sortBy(_.arguments.size) - - for mention <- mentions - do printMention(mention) - - def printMention(mention: Mention, nameOpt: Option[String] = None, depth: Int = 0): Unit = - val indent = " " * depth - val name = nameOpt.getOrElse("") - val labels = mention.labels - val words = mention.sentenceObj.words - val tokens = mention.tokenInterval.map(mention.sentenceObj.words) - - println(indent + " Name: " + name) - println(indent + " Labels: " + labels.mkString(" ")) - println(indent + " Sentence: " + words.mkString(" ")) - println(indent + " Tokens: " + tokens.mkString(" ")) - if mention.arguments.nonEmpty then - println(indent + "Arguments:") - for (name, mentions) <- mention.arguments; mention <- mentions - do printMention(mention, Some(name), depth + 1) - println() diff --git a/library/src/main/scala-3/org/clulab/scala/SeqView.scala b/library/src/main/scala-3/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..e227e7cbb --- /dev/null +++ b/library/src/main/scala-3/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Type[T] = scala.collection.View[T] +} diff --git a/library/src/main/scala-3/org/clulab/scala/package.scala b/library/src/main/scala-3/org/clulab/scala/package.scala deleted file mode 100644 index 8df18bbdf..000000000 --- a/library/src/main/scala-3/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.collection.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{LazyList => ImmutableLazyList} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableLazyList[T] - val LazyList = ImmutableLazyList -} diff --git a/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index c4ed49b82..000000000 --- a/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,14 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object DependencyMap extends DependencyMapNames { - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value. - type DependencyMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): DependencyMap = { - // we have very few dependency types, so let's create a small hash to save memory. 
- new DependencyMap(2, mutable.HashMap.defaultLoadFactor) - } -} diff --git a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala deleted file mode 100644 index fd1b32794..000000000 --- a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,20 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object GraphMap extends GraphMapNames { - - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value - type GraphMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): GraphMap = { - // we have very few dependency types, so let's create a small hash to save memory. - new GraphMap(2, mutable.HashMap.defaultLoadFactor) - } - - def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMap = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala index add58c14d..9804190fe 100644 --- a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala +++ b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala @@ -2,6 +2,7 @@ package org.clulab.numeric import org.clulab.numeric.mentions.Norm import org.clulab.processors.Processor +import org.clulab.processors.clu.BalaurProcessor import java.nio.charset.StandardCharsets import scala.io.Source @@ -9,9 +10,12 @@ import scala.util.Using object EvalTimeNorm { - def runEval(proc: Processor, ner: NumericEntityRecognizer, - testFile: String): Double = { - val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet" + def runEval( + proc: Processor, + timeNormEvalDir: String, + testFile: String, + ner: NumericEntityRecognizer + ): Double = { val goldStream = getClass.getResourceAsStream(s"$timeNormEvalDir/$testFile") val goldLines = Source.fromInputStream(goldStream).getLines() // Build a Map with the gold time expressions. @@ -34,8 +38,9 @@ object EvalTimeNorm { } val doc = proc.annotate(docText) val mentions = ner.extractFrom(doc) - setLabelsAndNorms(doc, mentions) - val prediction = mentions.collect{ + // The following line does not change the document. 
+ // NumericUtils.mkLabelsAndNorms(doc, mentions) + val prediction = mentions.collect{ case m: Norm if m.neLabel.equals("DATE") || m.neLabel.equals("DATE-RANGE") => (m.startOffset.toString, m.endOffset.toString, m.neNorm) }.toSet @@ -53,13 +58,9 @@ object EvalTimeNorm { fscore } - def run(proc: Processor): Double = { - val ner = NumericEntityRecognizer() + def run(proc: BalaurProcessor, timeNormEvalDir: String, testFile: String): Double = { + val ner = proc.numericEntityRecognizerOpt.get - test(proc, ner) - } - - def test(proc: Processor, ner: NumericEntityRecognizer): Double = { - runEval(proc, ner, "WorldModelersDatesRangesTimex.csv") + runEval(proc, timeNormEvalDir, testFile, ner) } } diff --git a/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index 3d5976a7d..73cc1940d 100644 --- a/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -19,41 +19,29 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions new NumericEntityRecognizer(lexiconNer, actions, extractorEngine) } - /** Matches the lexicon NER on this document, setting the `entities` field */ - def matchLexiconNer(document: Document): Seq[Option[Array[String]]] = { - val originalEntities = new ArrayBuffer[Option[Array[String]]]() - - for(sent <- document.sentences) { - originalEntities += sent.entities - - val labels = lexiconNer.find(sent) - // this needs to happen in place, otherwise Odin does not see these labels - // we will restore the original Sentence.entities at the end in `extractFrom` - sent.entities = Some(labels) - // println(s"ENTITIES: ${sent.entities.get.mkString(" ")}") - } - - originalEntities - } - /** * Entry point for numeric entity recognition * @param doc Input document * @return sets in place the sequence of NER labels and sequence of NER norms (using the TempEval-2 notation) */ - def extractFrom(doc:Document): Seq[Mention] = { - // dictionaries - val originalEntities = matchLexiconNer(doc) - // grammars - var mentions = extractor.extractFrom(doc) + def extractFrom(doc: Document): Seq[Mention] = { + val newSentences = doc.sentences.map { sentence => + val newEntities = lexiconNer.find(sentence) + + sentence.copy(entities = Some(newEntities)) + } + val newDocument = doc.copy(sentences = newSentences) + val mentions = { + val dirtyMentions = extractor.extractFrom(newDocument) + val cleanMentions = actions.cleanupAction(dirtyMentions) - // restore the original entities - for(i <- originalEntities.indices) { - doc.sentences(i).entities = originalEntities(i) + cleanMentions } - // global actions *after* all grammars are done - actions.cleanupAction(mentions) + // These mentions will have doc pointing to the newDocument, + // but sentence will be the index into the new sentences and + // will be valid for the original doc. 
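Since the reworked extractFrom above no longer writes labels into the document, a caller that still wants the recognized entities and norms attached to the sentences has to rebuild them itself. The following is a caller-side sketch only, not part of this change set: it assumes the (entities, norms) pairs returned by NumericUtils.mkLabelsAndNorms shown further below, and a Sentence.copy that also accepts a norms parameter (only the entities parameter appears in these hunks); ner and doc are illustrative names.

// Caller-side sketch; every name here is illustrative, not from this change set.
val mentions = ner.extractFrom(doc)
val (allEntities, allNorms) = NumericUtils.mkLabelsAndNorms(doc, mentions)
// Rebuild each sentence with its recognized entities and norms attached.
val labeledSentences = doc.sentences.zip(allEntities.zip(allNorms)).map {
  case (sentence, (entities, norms)) =>
    sentence.copy(entities = Some(entities), norms = Some(norms))
}
// The document is rebuilt rather than modified in place.
val labeledDoc = doc.copy(sentences = labeledSentences)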
+ mentions } } diff --git a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala new file mode 100644 index 000000000..ba9bcd84b --- /dev/null +++ b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala @@ -0,0 +1,144 @@ +package org.clulab.numeric + +import org.clulab.numeric.actions.NumericActions +import org.clulab.numeric.mentions.Norm +import org.clulab.odin.{EventMention, Mention} +import org.clulab.processors.Document +import org.clulab.struct.Interval +import org.clulab.utils.WrappedArraySeq + +import scala.collection.mutable + +object NumericUtils { + def displayMentions(mentions: Seq[Mention], doc: Document): Unit = { + val mentionsBySentence = mentions.groupBy(_.sentence).map { case (sentence, mentions) => + sentence -> mentions.sortBy(_.start) + }.withDefaultValue(Nil) + for ((s, i) <- doc.sentences.zipWithIndex) { + println(s"sentence #$i") + println(s.getSentenceText) + println("Tokens: " + s.words.indices.zip(s.words).zip(s.tags.get).mkString(", ")) + s.tags foreach (x => println("Tags: " + x.mkString(", "))) + s.entities foreach (x => println("Entities: " + x.mkString(", "))) + s.norms foreach (x => println("Norms: " + x.mkString(", "))) + println() + + val sortedMentions = mentionsBySentence(i).sortBy(_.label) + sortedMentions foreach displayMention + println() + } + } + + def displayMention(mention: Mention): Unit = { + val boundary = s"\t${"-" * 30}" + println(s"${mention.labels} => ${mention.text}") + println(boundary) + println(s"\tRule => ${mention.foundBy}") + val mentionType = mention.getClass.toString.split("""\.""").last + println(s"\tType => $mentionType") + println(s"\tInterval => ${mention.tokenInterval}") + mention match { + case norm: Norm => + println(s"\tNorm => ${norm.neNorm}") + println(s"\tNE => ${norm.neLabel}") + case _ => + } + println(boundary) + if (mention.arguments.nonEmpty) { + println("\tArgs:") + mention match { + case em: EventMention => + println(s"\ttrigger: ${em.trigger}") + displayArguments(em) + case _ => + displayArguments(mention) + } + println(s"$boundary") + } + println() + } + + def displayArguments(b: Mention): Unit = { + b.arguments foreach { + case (argName, ms) => + ms foreach { v => + println(s"\t * $argName ${v.labels.mkString("(", ", ", ")")} => ${v.text}") + } + } + } + + /** + * Sets the entities and norms fields in each Sentence based on the given numeric mentions + * @param doc This document is modified in place + * @param mentions The numeric mentions previously extracted + */ + def mkLabelsAndNorms(doc: Document, mentions: Seq[Mention]): (Seq[Seq[String]], Seq[Seq[String]]) = { + val pertinentMentions = mentions.collect { + case mention: Norm if NumericActions.isNumeric(mention) => mention + } + val mentionsBySentenceIndex = pertinentMentions.groupBy { mention => mention.sentence } + val zippedLabelsAndNorms = doc.sentences.zipWithIndex.map { case (sentence, index) => + val mentions = mentionsBySentenceIndex.getOrElse(index, Seq.empty) + + if (mentions.isEmpty) { + val entities = sentence.entities.getOrElse(WrappedArraySeq(Array.fill(sentence.size)("O")).toImmutableSeq) + val norms = sentence.norms.getOrElse(WrappedArraySeq(Array.fill(sentence.size)("")).toImmutableSeq) + + (entities, norms) + } + else { + val mutableEntities = sentence.entities + .map { entities => Array(entities: _*) } + .getOrElse(Array.fill(sentence.size)("O")) + val mutableNorms = sentence.norms + .map { norms => Array(norms: _*) } + 
.getOrElse(Array.fill(sentence.size)("")) + + mentions.foreach { mention => + addLabelsAndNorms(mention.neLabel, mention.neNorm, mention.tokenInterval, mutableEntities, mutableNorms) + } + removeOneEntityBeforeAnother(mutableEntities, mutableNorms, "B-LOC", "MEASUREMENT-LENGTH") + + val immutableEntities = WrappedArraySeq(mutableEntities).toImmutableSeq + val immutableNorms = WrappedArraySeq(mutableNorms).toImmutableSeq + (immutableEntities, immutableNorms) + } + } + val unzippedLabelsAndNorms = zippedLabelsAndNorms.unzip + + unzippedLabelsAndNorms + } + + def removeOneEntityBeforeAnother(entities: mutable.Seq[String], norms: mutable.Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = { + var triggered = false + + entities.indices.reverse.foreach { index => + val entity = entities(index) + + if (entity == triggerEntity) + triggered = true + else { + if (triggered) + if (entity.endsWith(toBeRemovedShortened)) { + entities(index) = "O" + norms(index) = "" + } + else + triggered = false + } + } + } + + private def addLabelsAndNorms(label: String, norm: String, tokenInt: Interval, entities: mutable.Seq[String], norms: mutable.Seq[String]): Unit = { + // careful here: we may override some existing entities and norms + // but, given that the numeric entity rules tend to be high precision, this is probably Ok... + tokenInt.headOption.foreach { index => + entities(index) = "B-" + label + norms(index) = norm + } + tokenInt.tail.foreach { index => + entities(index) = "I-" + label + norms(index) = norm + } + } +} diff --git a/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index e2c3fcf97..5d686c2ba 100644 --- a/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -252,14 +252,14 @@ class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNor /** filter out season homonyms (fall, spring) **/ def postprocessNumericEntities(mentions: Seq[Mention]): Seq[Mention] = { - def prevWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + def prevWordsMatch(words: Seq[String], wordIndex: Int): Boolean = { val prevWords = words.slice(wordIndex - 2, wordIndex).map(_.toLowerCase) prevWords.exists(NumericActions.preSeasons) || prevWords.containsSlice(NumericActions.inThe) } - def contextWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + def contextWordsMatch(words: Seq[String], wordIndex: Int): Boolean = { val window = 5 val contextWords = words.slice(wordIndex - window, wordIndex + window).map(_.toLowerCase) diff --git a/library/src/main/scala/org/clulab/numeric/package.scala b/library/src/main/scala/org/clulab/numeric/package.scala deleted file mode 100644 index 70559d0f9..000000000 --- a/library/src/main/scala/org/clulab/numeric/package.scala +++ /dev/null @@ -1,127 +0,0 @@ -package org.clulab - -import org.clulab.numeric.actions.NumericActions -import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMention, Norm, PercentageMention} -import org.clulab.odin.{EventMention, Mention} -import org.clulab.processors.{Document, Sentence} -import org.clulab.struct.Interval -import _root_.scala.util.control.Breaks._ - -package object numeric { - def displayMentions(mentions: Seq[Mention], doc: Document): Unit = { - val mentionsBySentence = mentions.groupBy(_.sentence).map { case (sentence, mentions) => - sentence -> mentions.sortBy(_.start) - 
}.withDefaultValue(Nil) - for ((s, i) <- doc.sentences.zipWithIndex) { - println(s"sentence #$i") - println(s.getSentenceText) - println("Tokens: " + s.words.indices.zip(s.words).zip(s.tags.get).mkString(", ")) - s.tags foreach (x => println("Tags: " + x.mkString(", "))) - s.entities foreach (x => println("Entities: " + x.mkString(", "))) - s.norms foreach (x => println("Norms: " + x.mkString(", "))) - println() - - val sortedMentions = mentionsBySentence(i).sortBy(_.label) - sortedMentions foreach displayMention - println() - } - } - - def displayMention(mention: Mention): Unit = { - val boundary = s"\t${"-" * 30}" - println(s"${mention.labels} => ${mention.text}") - println(boundary) - println(s"\tRule => ${mention.foundBy}") - val mentionType = mention.getClass.toString.split("""\.""").last - println(s"\tType => $mentionType") - println(s"\tInterval => ${mention.tokenInterval}") - mention match { - case norm: Norm => - println(s"\tNorm => ${norm.neNorm}") - println(s"\tNE => ${norm.neLabel}") - case _ => - } - println(boundary) - if (mention.arguments.nonEmpty) { - println("\tArgs:") - mention match { - case em: EventMention => - println(s"\ttrigger: ${em.trigger}") - displayArguments(em) - case _ => - displayArguments(mention) - } - println(s"$boundary") - } - println() - } - - def displayArguments(b: Mention): Unit = { - b.arguments foreach { - case (argName, ms) => - ms foreach { v => - println(s"\t * $argName ${v.labels.mkString("(", ", ", ")")} => ${v.text}") - } - } - } - - /** - * Sets the entities and norms fields in each Sentence based on the given numeric mentions - * @param doc This document is modified in place - * @param mentions The numeric mentions previously extracted - */ - def setLabelsAndNorms(doc: Document, mentions: Seq[Mention]): Unit = { - // - // initialize entities and norms - // - for (sentence <- doc.sentences) { - sentence.entities = sentence.entities.orElse(Some(Array.fill(sentence.size)("O"))) - sentence.norms = sentence.norms .orElse(Some(Array.fill(sentence.size)(""))) - } - - // - // convert numeric entities to entity labels and norms - // - for(mention <- mentions) { - if(NumericActions.isNumeric(mention) && mention.isInstanceOf[Norm]) { - addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval) - } - } - removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT-LENGTH") - } - - def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = { - // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal' - // toBeRemovedShortened is entity without BIO- - for(s <- doc.sentences) { - val zippedEntities = s.entities.get.zipWithIndex - for ((e, i) <- zippedEntities) { - if (i > 0 && e == triggerEntity && s.entities.get(i-1).endsWith(toBeRemovedShortened)) { - s.entities.get(i - 1) = "O" - // go in reverse replacing indices and norms in the immediate preceding mention - breakable { - for ((en, j) <- zippedEntities.slice(0, i ).reverse) { - if (en.endsWith(toBeRemovedShortened)) { - s.entities.get(j) = "O" - s.norms.get(j) = "" - } else break() - } - } - } - } - } - } - - private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = { - var first = true - val norm = m.neNorm - // careful here: we may override some existing entities and norms - // but, given that the numeric entity rules tend to be high precision, this is probably Ok... 
- for(i <- tokenInt.indices) { - val prefix = if(first) "B-" else "I-" - s.entities.get(i) = prefix + m.neLabel - s.norms.get(i) = norm - first = false - } - } -} diff --git a/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala b/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala index f6e282934..26c2252e8 100644 --- a/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala +++ b/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala @@ -3,6 +3,7 @@ package org.clulab.odin.impl import org.clulab.odin.impl.MarkdownGeneration._ import org.clulab.odin.impl.RuleReader.{DefaultAction, Rule} +import scala.collection.compat._ import scala.collection.mutable.ArrayBuffer case class RuleSchema( @@ -180,7 +181,7 @@ object MarkdownGeneration { extractorType = "CrossSentenceExtractor", labels = x.labels, priority = priorityString(x.priority), - action = if (r.action != DefaultAction) Some(r.action) else None, + action = Option.when(r.action != DefaultAction)(r.action), keep = x.keep, additional = Map( "leftWindow" -> x.leftWindow.toString, @@ -198,7 +199,7 @@ object MarkdownGeneration { extractorType = "TokenExtractor", labels = x.labels, priority = priorityString(x.priority), - action = if (r.action != DefaultAction) Some(r.action) else None, + action = Option.when(r.action != DefaultAction)(r.action), keep = x.keep, additional = Map.empty, arguments = Seq.empty @@ -213,7 +214,7 @@ object MarkdownGeneration { extractorType = "GraphExtractor", labels = x.labels, priority = priorityString(x.priority), - action = if (r.action != DefaultAction) Some(r.action) else None, + action = Option.when(r.action != DefaultAction)(r.action), keep = x.keep, additional = Map.empty, arguments = toArgSchema(x.pattern.arguments) diff --git a/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala b/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala index f6b8c2c7c..817d93d52 100644 --- a/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala +++ b/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala @@ -1,6 +1,7 @@ package org.clulab.odin.impl import java.io.{BufferedInputStream, InputStream} +import scala.collection.compat._ import scala.io.Source /** @@ -22,8 +23,7 @@ object OdinResourceManager { val embeddingsOption: Option[OdinResource] = constructorMap("embeddings") // cast as EmbeddingsResources, if present val embeddings: Option[EmbeddingsResource] = - if (embeddingsOption.nonEmpty) Some(embeddingsOption.get.asInstanceOf[EmbeddingsResource]) - else None + Option.when(embeddingsOption.nonEmpty)(embeddingsOption.get.asInstanceOf[EmbeddingsResource]) new OdinResourceManager(embeddings) } diff --git a/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala b/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala index a349b4193..45f9a1e35 100644 --- a/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala +++ b/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala @@ -13,6 +13,7 @@ import java.net.URL import java.nio.charset.Charset import java.nio.charset.StandardCharsets import java.util.{Collection, Map => JMap} +import scala.collection.compat._ import scala.io.{Codec, Source} import scala.jdk.CollectionConverters._ import scala.util.Using @@ -28,8 +29,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option private val mirror = new ActionMirror(actions) val ruleYamlOpt = - if (OdinConfig.keepRule) Some(new Yaml(new 
Constructor(classOf[Map[String, Any]]))) - else None + Option.when(OdinConfig.keepRule)(new Yaml(new Constructor(classOf[Map[String, Any]]))) def read(input: String): Vector[Extractor] = { val rules = getRules(input) diff --git a/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala b/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala index 3afe9794a..96c3d2e57 100644 --- a/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala +++ b/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala @@ -1,7 +1,7 @@ package org.clulab.odin.impl -import org.clulab.scala.LazyList import java.util.{ Collection, Map => JMap } +import scala.collection.compat.immutable.LazyList import scala.jdk.CollectionConverters._ class Taxonomy(parents: Map[String, String]) { diff --git a/library/src/main/scala/org/clulab/odin/impl/Values.scala b/library/src/main/scala/org/clulab/odin/impl/Values.scala index c2e03bbd5..0b78e7f45 100644 --- a/library/src/main/scala/org/clulab/odin/impl/Values.scala +++ b/library/src/main/scala/org/clulab/odin/impl/Values.scala @@ -3,7 +3,7 @@ package org.clulab.odin.impl import org.clulab.processors.Document trait Values { - def values(strings: Option[Array[String]], msg: String): Array[String] = + def values(strings: Option[Seq[String]], msg: String): Seq[String] = strings match { case None => sys.error(msg) case Some(strings) => strings diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index 6435ab94c..1c9e1ece3 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -1,46 +1,52 @@ package org.clulab.processors -import java.io.PrintWriter - -import org.clulab.struct.{CorefChains, DirectedGraphEdgeIterator} +import org.clulab.struct.CorefChains import org.clulab.utils.Hash import org.clulab.utils.Serializer import org.json4s.JString import org.json4s.JValue import org.json4s.jackson.prettyJson -import scala.collection.mutable - /** * Stores all annotations for one document. * Written by: Mihai Surdeanu and Gus Hahn-Powell. * Last Modified: Add apply method to copy Document. */ -class Document(val sentences: Array[Sentence]) extends Serializable { - +class Document( + val sentences: Seq[Sentence], /** Unique id for this document, if any */ - var id: Option[String] = None - + val id: Option[String] = None, /** Clusters of coreferent mentions */ - var coreferenceChains: Option[CorefChains] = None - + val coreferenceChains: Option[CorefChains] = None, /** The original text corresponding to this document, if it was preserved by the corresponding processor */ - var text: Option[String] = None - + val text: Option[String] = None, /** Map of any arbitrary document attachments such as document creation time */ - protected var attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None - - protected var documentCreationTime:Option[String] = None + val attachments: Option[DocumentAttachments.Type] = None, + /** + * The document creation time using the CoreNLP format + * See useFixedDate here for more details: https://stanfordnlp.github.io/CoreNLP/ner.html#setting-document-date + * The DCT will impact how Sentence.norms are generated for DATE expressions. 
+ */ + val dct: Option[String] = None +) extends Serializable { + + def copy( + sentences: Seq[Sentence] = sentences, + id: Option[String] = id, + coreferenceChains: Option[CorefChains] = coreferenceChains, + text: Option[String] = text, + attachments: Option[DocumentAttachments.Type] = None, + dct: Option[String] = dct + ): Document = new Document(sentences, id, coreferenceChains, text, attachments, dct) /** Clears any internal state potentially constructed by the annotators */ - def clear(): Unit = { } + def clear(): Unit = { } // This is for subclass support. /** * Used to compare Documents. * @return a hash (Int) based primarily on the sentences, ignoring attachments */ def equivalenceHash: Int = { - val stringCode = "org.clulab.processors.Document" // Hash representing the sentences. @@ -66,133 +72,6 @@ class Document(val sentences: Array[Sentence]) extends Serializable { Hash.ordered(sentences.map(_.ambivalenceHash)) ) - /** Adds an attachment to the document's attachment map */ - def addAttachment(name: String, attachment: DocumentAttachment): Unit = { - if (attachments.isEmpty) - attachments = Some(new mutable.HashMap[String, DocumentAttachment]()) - attachments.get += name -> attachment - } - - /** Retrieves the attachment with the given name */ - def getAttachment(name: String): Option[DocumentAttachment] = attachments.flatMap(_.get(name)) - - def removeAttachment(name: String): Unit = attachments.foreach(_ -= name) - - /** Retrieves keys to all attachments so that the entire collection can be read - * for purposes including but not limited to serialization. If there are no - * attachments, that is attachments == None, an empty set is returned. - * This does not distinguish between None and Some(HashMap.empty), especially - * since the latter should not be possible because of the lazy initialization. - */ - def getAttachmentKeys: collection.Set[String] = { - attachments.map { attachments => - attachments.keySet - }.getOrElse(collection.Set.empty[String]) - } - - /** - * Sets the document creation time using the CoreNLP format. 
- * See useFixedDate here for more details: https://stanfordnlp.github.io/CoreNLP/ner.html#setting-document-date - * The DCT will impacts how Sentence.norms are generated for DATE expressions - * @param dct Document creation time - */ - def setDCT(dct:String): Unit = documentCreationTime = Some(dct) - - def getDCT: Option[String] = documentCreationTime - - def prettyPrint(pw: PrintWriter): Unit = { - // let's print the sentence-level annotations - var sentenceCount = 0 - for (sentence <- sentences) { - pw.println("Sentence #" + sentenceCount + ":") - pw.println("Tokens: " + sentence.words.zipWithIndex.mkString(" ")) - pw.println("Start character offsets: " + sentence.startOffsets.mkString(" ")) - pw.println("End character offsets: " + sentence.endOffsets.mkString(" ")) - - // these annotations are optional, so they are stored using Option objects, hence the foreach statement - sentence.lemmas.foreach(lemmas => pw.println(s"Lemmas: ${lemmas.mkString(" ")}")) - sentence.tags.foreach(tags => pw.println(s"POS tags: ${tags.mkString(" ")}")) - sentence.chunks.foreach(chunks => pw.println(s"Chunks: ${chunks.mkString(" ")}")) - sentence.entities.foreach(entities => pw.println(s"Named entities: ${entities.mkString(" ")}")) - sentence.norms.foreach(norms => pw.println(s"Normalized entities: ${norms.mkString(" ")}")) - sentence.universalBasicDependencies.foreach(dependencies => { - pw.println("Basic syntactic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.universalEnhancedDependencies.foreach(dependencies => { - pw.println("Enhanced syntactic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.semanticRoles.foreach(dependencies => { - pw.println("Semantic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.enhancedSemanticRoles.foreach(dependencies => { - pw.println("Enhanced semantic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.syntacticTree.foreach(tree => { - pw.println("Constituent tree: " + tree.toStringDepth(showHead = false)) - // see the org.clulab.struct.Tree class for more information - // on syntactic trees, including access to head phrases/words - }) - - sentenceCount += 1 - pw.println("\n") - } - - // let's print the coreference chains - coreferenceChains.foreach(chains => { - for (chain <- chains.getChains) { - pw.println("Found one coreference chain containing the following mentions:") - for (mention <- chain) { - // note that all these offsets start at 0 too - 
pw.println("\tsentenceIndex:" + mention.sentenceIndex + - " headIndex:" + mention.headIndex + - " startTokenOffset:" + mention.startOffset + - " endTokenOffset:" + mention.endOffset + - " text: " + sentences(mention.sentenceIndex).words.slice(mention.startOffset, mention.endOffset).mkString("[", " ", "]")) - } - } - }) - } - - def assimilate(document: Document, textOpt: Option[String]): Document = { - id = document.id - coreferenceChains = document.coreferenceChains - text = textOpt - attachments = document.attachments - documentCreationTime = document.documentCreationTime - this - } - - // sentences are a val, so they must be initialized through the construction of a new Document. - // Thereafter, the remaining values can be assimilated from the old document. The shortcut - // is used so that subclasses don't have to duplicate almost everything in their copy. - def copy(sentences: Array[Sentence] = sentences, textOpt: Option[String] = text): Document = { - new Document(sentences).assimilate(this, textOpt) - } - def offset(offset: Int): Document = // If a subclass of Document constructs itself with an attachment or a documentCreationTime that // would be overwritten on the copy(), then it should provide its own copy() method(s). @@ -202,20 +81,37 @@ class Document(val sentences: Array[Sentence]) extends Serializable { object Document { - def apply(sentences: Array[Sentence]): Document = new Document(sentences) + def apply(sentences: Seq[Sentence]): Document = apply(sentences, text = None) + + def apply(sentences: Seq[Sentence], text: Option[String]): Document = apply(id = None, sentences, coref = None, text) - def apply(id: Option[String], sentences: Array[Sentence], coref: Option[CorefChains], text: Option[String]): Document = { - val d = Document(sentences) - d.id = id - d.coreferenceChains = coref - d.text = text - d + def apply(id: Option[String], sentences: Seq[Sentence], coref: Option[CorefChains], text: Option[String]): Document = { + val document = new Document( + sentences, + id = id, + coreferenceChains = coref, + text = text + ) + + document } - /** Return a new Document with relevant fields copied from the given Document. */ - def apply (doc: Document): Document = - Document(doc.id, doc.sentences, doc.coreferenceChains, doc.text) + /** Return a new Document with some relevant fields copied from the given Document. */ + def apply(doc: Document): Document = + apply(doc.id, doc.sentences, doc.coreferenceChains, doc.text) + + def apply(doc: Document, sentences: Seq[Sentence]): Document = { + val newDocument = new Document( + sentences, + id = doc.id, + coreferenceChains = doc.coreferenceChains, + text = doc.text, + attachments = doc.attachments, + dct = doc.dct + ) + newDocument + } } /** @@ -317,6 +213,11 @@ trait JsonSerializerAble { */ trait DocumentAttachment extends DocumentAble with DocumentSerializerAble with JsonSerializerAble +object DocumentAttachments { + type Type = Map[String, DocumentAttachment] +} + + /** * Designed to store intermediate attachments that are only used to pass information between processor components. 
* Thus, these do not need to be serialized diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala index 9528d613d..e856f20a5 100644 --- a/library/src/main/scala/org/clulab/processors/Processor.scala +++ b/library/src/main/scala/org/clulab/processors/Processor.scala @@ -2,6 +2,8 @@ package org.clulab.processors import org.clulab.processors.clu.BalaurProcessor +import scala.collection.mutable + /** * User: mihais * Date: 3/1/13 @@ -10,7 +12,7 @@ import org.clulab.processors.clu.BalaurProcessor trait Processor { /** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */ - def mkDocument (text:String, keepText:Boolean = false): Document + def mkDocument(text:String, keepText:Boolean = false): Document // The documents here were created with Processor.mkDocument, which could have created a subclassed // Document or documents with certain fields already filled in. This implementation only handles @@ -21,31 +23,35 @@ trait Processor { require(documents.length > 1) val headDocument = documents.head val tailDocuments = documents.tail - val combinedSentences = documents.flatMap(_.sentences).toArray - val combinedDocument = new Document(combinedSentences) val headId = headDocument.id require(tailDocuments.forall(_.id == headId)) - combinedDocument.id = headId - - require(combinedDocument.text.isEmpty) - combinedDocument.text = combinedTextOpt - + val headDctOpt = headDocument.dct + require(documents.tail.forall(_.dct == headDctOpt)) // Coreference chains involve Mentions that include references to documents. The Mentions are being // moved to a new Document and it would be infeasible to move the chains. - require(combinedDocument.coreferenceChains.isEmpty) require(documents.forall(_.coreferenceChains.isEmpty)) - documents.foreach { document => - document.getAttachmentKeys.foreach { attachmentKey => - require(combinedDocument.getAttachment(attachmentKey).forall(_ == document.getAttachment(attachmentKey).get)) - combinedDocument.addAttachment(attachmentKey, document.getAttachment(attachmentKey).get) - } + val allAttachments = documents.flatMap { document => + document.attachments.getOrElse(Map.empty).toSeq } + // This will remove duplicate (key, value) pairs. + val distinctAttachments = allAttachments.distinct + // If for any key, there are different, contradictory values, only one value will make it into the map. + val attachments = distinctAttachments.toMap + + require(attachments.size == distinctAttachments.length, "Attachments can't contradict each other. Each key needs to map onto the same value.") + + val combinedSentences = documents.flatMap(_.sentences) + val combinedDocument = new Document( + sentences = combinedSentences, + id = headId, + coreferenceChains = None, + text = combinedTextOpt, + attachments = Some(attachments), + dct = headDctOpt + ) - val headDctOpt = headDocument.getDCT - require(documents.tail.forall(_.getDCT == headDctOpt)) - headDctOpt.foreach(combinedDocument.setDCT) combinedDocument } @@ -76,16 +82,22 @@ trait Processor { } /** Constructs a document of tokens from an array of untokenized sentences. */ - def mkDocumentFromSentences (sentences:Iterable[String], - keepText:Boolean = false, - charactersBetweenSentences:Int = 1): Document + def mkDocumentFromSentences( + sentences: Iterable[String], + keepText: Boolean = false, + charactersBetweenSentences: Int = 1 + ): Document /** Constructs a document of tokens from an array of tokenized sentences. 
*/ - def mkDocumentFromTokens (sentences:Iterable[Iterable[String]], - keepText:Boolean = false, - charactersBetweenSentences:Int = 1, - charactersBetweenTokens:Int = 1): Document + def mkDocumentFromTokens( + sentences: Iterable[Iterable[String]], + keepText: Boolean = false, + charactersBetweenSentences: Int = 1, + charactersBetweenTokens: Int = 1 + ): Document + /** Lemmatization; returns the lemmas of the given words. */ + def lemmatize(words: Seq[String]): Seq[String] // Side-effecting annotations. These modify the document in place, which is not too elegant. // There are two reasons for this: @@ -94,54 +106,54 @@ trait Processor { // (2) It is more efficient during annotate() where all the possible operations are chained. /** Part of speech tagging; modifies the document in place. */ - def tagPartsOfSpeech (doc:Document): Unit - - /** Lematization; modifies the document in place. */ - def lemmatize (doc:Document): Unit + def tagPartsOfSpeech(doc: Document): Unit /** Named Entity Recognition; modifies the document in place. */ - def recognizeNamedEntities (doc:Document): Unit + def recognizeNamedEntities(doc: Document): Unit /** Syntactic parsing; modifies the document in place. */ - def parse (doc:Document): Unit + def parse(doc:Document): Unit /** Semantic role labeling */ - def srl (doc: Document): Unit + def srl(doc: Document): Unit /** Shallow parsing; modifies the document in place. */ - def chunking (doc:Document): Unit + def chunking(doc:Document): Unit /** Coreference resolution; modifies the document in place. */ - def resolveCoreference (doc:Document): Unit + def resolveCoreference(doc:Document): Unit /** Discourse parsing; modifies the document in place. */ - def discourse (doc:Document): Unit + def discourse(doc:Document): Unit /** Relation extraction; modifies the document in place. */ def relationExtraction(doc:Document): Unit /** Annotate the given text string, specify whether to retain the text in the resultant Document. */ - def annotate (text:String, keepText:Boolean = false): Document = { - val doc = mkDocument(text, keepText) - if (doc.sentences.nonEmpty) - annotate(doc) - else - doc + def annotate(text: String, keepText: Boolean = false): Document = { + val tokenizedDoc = mkDocument(text, keepText) + val annotatedDoc = // For now, these two documents have the same type. + if (tokenizedDoc.sentences.nonEmpty) annotate(tokenizedDoc) + else tokenizedDoc + + annotatedDoc } /** Annotate the given sentences, specify whether to retain the text in the resultant Document. */ - def annotateFromSentences ( - sentences:Iterable[String], - keepText:Boolean = false): Document = { + def annotateFromSentences( + sentences: Iterable[String], + keepText: Boolean = false + ): Document = { val doc = mkDocumentFromSentences(sentences, keepText) annotate(doc) } /** Annotate the given tokens, specify whether to retain the text in the resultant Document.
*/ - def annotateFromTokens ( + def annotateFromTokens( sentences:Iterable[Iterable[String]], - keepText:Boolean = false): Document = { + keepText:Boolean = false + ): Document = { val doc = mkDocumentFromTokens(sentences, keepText) annotate(doc) } diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 0465226c1..acb14b56b 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -1,21 +1,18 @@ package org.clulab.processors -import org.clulab.scala.WrappedArray._ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree} -import org.clulab.struct.GraphMap._ import org.clulab.utils.Hash -import org.clulab.utils.SeqUtils import scala.collection.mutable /** Stores the annotations for a single sentence */ class Sentence( /** Raw tokens in this sentence; these MUST match the original text */ - val raw: Array[String], + val raw: Seq[String], /** Start character offsets for the raw tokens; start at 0 */ - val startOffsets: Array[Int], + val startOffsets: Seq[Int], /** End character offsets for the raw tokens; start at 0 */ - val endOffsets: Array[Int], + val endOffsets: Seq[Int], /** * Words produced from raw tokens, closer to what the downstream components expect @@ -24,25 +21,25 @@ class Sentence( * However, the number of raw tokens MUST always equal the number of words, so if the exact text must be recovered, * please use the raw tokens with the same positions */ - val words: Array[String]) extends Serializable { + val words: Seq[String], /** POS tags for words */ - var tags: Option[Array[String]] = None + val tags: Option[Seq[String]] = None, /** Lemmas */ - var lemmas: Option[Array[String]] = None + val lemmas: Option[Seq[String]] = None, /** NE labels */ - var entities: Option[Array[String]] = None + val entities: Option[Seq[String]] = None, /** Normalized values of named/numeric entities, such as dates */ - var norms: Option[Array[String]] = None + val norms: Option[Seq[String]] = None, /** Shallow parsing labels */ - var chunks: Option[Array[String]] = None + val chunks: Option[Seq[String]] = None, /** Constituent tree of this sentence; includes head words */ - var syntacticTree: Option[Tree] = None + val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - var graphs: GraphMap = GraphMap() + val graphs: GraphMap.Type = GraphMap.empty, /** Relation triples from OpenIE */ - var relations:Option[Array[RelationTriple]] = None - + val relations:Option[Seq[RelationTriple]] = None +) extends Serializable { def size:Int = raw.length @@ -66,7 +63,7 @@ class Sentence( def equivalenceHash: Int = { val stringCode = "org.clulab.processors.Sentence" - def getAnnotationsHash(labelsOpt: Option[Array[_]]): Int = labelsOpt + def getAnnotationsHash(labelsOpt: Option[Seq[_]]): Int = labelsOpt .map { labels => val hs = labels.map(_.hashCode) val result = Hash.withLast(labels.length)( @@ -98,39 +95,38 @@ class Sentence( * * @return A directed graph of dependencies if any exist, otherwise None */ - def dependencies:Option[DirectedGraph[String]] = graphs match { - case collapsed if collapsed.contains(UNIVERSAL_ENHANCED) => collapsed.get(UNIVERSAL_ENHANCED) - case basic if basic.contains(UNIVERSAL_BASIC) => basic.get(UNIVERSAL_BASIC) + def dependencies: Option[DirectedGraph[String]] = graphs match { + case collapsed if collapsed.contains(GraphMap.UNIVERSAL_ENHANCED) => 
collapsed.get(GraphMap.UNIVERSAL_ENHANCED) + case basic if basic.contains(GraphMap.UNIVERSAL_BASIC) => basic.get(GraphMap.UNIVERSAL_BASIC) case _ => None } /** Fetches the universal basic dependencies */ - def universalBasicDependencies:Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_BASIC) + def universalBasicDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.UNIVERSAL_BASIC) /** Fetches the universal enhanced dependencies */ - def universalEnhancedDependencies:Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_ENHANCED) + def universalEnhancedDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.UNIVERSAL_ENHANCED) /** Fetches the Stanford basic dependencies */ - def stanfordBasicDependencies:Option[DirectedGraph[String]] = graphs.get(STANFORD_BASIC) + def stanfordBasicDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.STANFORD_BASIC) /** Fetches the Stanford collapsed dependencies */ - def stanfordCollapsedDependencies:Option[DirectedGraph[String]] = graphs.get(STANFORD_COLLAPSED) + def stanfordCollapsedDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.STANFORD_COLLAPSED) - def semanticRoles:Option[DirectedGraph[String]] = graphs.get(SEMANTIC_ROLES) - def enhancedSemanticRoles:Option[DirectedGraph[String]] = graphs.get(ENHANCED_SEMANTIC_ROLES) + def semanticRoles: Option[DirectedGraph[String]] = graphs.get(GraphMap.SEMANTIC_ROLES) - def hybridDependencies:Option[DirectedGraph[String]] = graphs.get(HYBRID_DEPENDENCIES) + def enhancedSemanticRoles: Option[DirectedGraph[String]] = graphs.get(GraphMap.ENHANCED_SEMANTIC_ROLES) - def setDependencies(depType: String, deps: DirectedGraph[String]): Unit = graphs += (depType -> deps) + def hybridDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.HYBRID_DEPENDENCIES) /** * Recreates the text of the sentence, preserving the original number of white spaces between tokens * * @return the text of the sentence */ - def getSentenceText:String = getSentenceFragmentText(0, words.length) + def getSentenceText: String = getSentenceFragmentText(0, words.length) - def getSentenceFragmentText(start:Int, end:Int):String = { + def getSentenceFragmentText(start: Int, end: Int):String = { // optimize the single token case if (end - start == 1) raw(start) else { @@ -149,49 +145,52 @@ class Sentence( } } - /** Reverts the current sentence */ - def revert():Sentence = { - val reverted = new Sentence( - SeqUtils.revert(raw).toArray, - SeqUtils.revert(startOffsets).toArray, - SeqUtils.revert(endOffsets).toArray, - SeqUtils.revert(words).toArray) - if(tags.nonEmpty) - reverted.tags = Some(SeqUtils.revert(tags.get).toArray) - if(lemmas.nonEmpty) - reverted.lemmas = Some(SeqUtils.revert(lemmas.get).toArray) - if(entities.nonEmpty) - reverted.entities = Some(SeqUtils.revert(entities.get).toArray) - if(norms.nonEmpty) - reverted.norms = Some(SeqUtils.revert(norms.get).toArray) - if(chunks.nonEmpty) - reverted.chunks = Some(SeqUtils.revert(chunks.get).toArray) - - // TODO: revert syntacticTree and graphs! - - reverted - } + /** Reverses the current sentence */ + def reverse(): Sentence = { + val reversedSentence = Sentence( + raw.reverse, + startOffsets.reverse, + endOffsets.reverse, + words.reverse, + tags.map(_.reverse), + lemmas.map(_.reverse), + entities.map(_.reverse), + norms.map(_.reverse), + chunks.map(_.reverse), + // TODO: revert syntacticTree and graphs! 
+ syntacticTree, + graphs, + relations + ) - def assimilate(sentence: Sentence): Sentence = { - tags = sentence.tags - lemmas = sentence.lemmas - entities = sentence.entities - norms = sentence.norms - chunks = sentence.chunks - syntacticTree = sentence.syntacticTree - graphs = sentence.graphs - relations = sentence.relations - this + reversedSentence } - def copy(raw: Array[String] = raw, startOffsets: Array[Int] = startOffsets, endOffsets: Array[Int] = endOffsets, words: Array[String] = words): Sentence = - new Sentence(raw, startOffsets, endOffsets, words).assimilate(this) + def copy( + raw: Seq[String] = raw, + startOffsets: Seq[Int] = startOffsets, + endOffsets: Seq[Int] = endOffsets, + words: Seq[String] = words, + + tags: Option[Seq[String]] = tags, + lemmas: Option[Seq[String]] = lemmas, + entities: Option[Seq[String]] = entities, + norms: Option[Seq[String]] = norms, + chunks: Option[Seq[String]] = chunks, + syntacticTree: Option[Tree] = syntacticTree, + graphs: GraphMap.Type = graphs, + relations: Option[Seq[RelationTriple]] = relations + ): Sentence = + new Sentence( + raw, startOffsets, endOffsets, words, + tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations + ) def offset(offset: Int): Sentence = { if (offset == 0) this else { - val newStartOffsets = startOffsets.map(_ + offset).toArray - val newEndOffsets = endOffsets.map(_ + offset).toArray + val newStartOffsets = startOffsets.map(_ + offset) + val newEndOffsets = endOffsets.map(_ + offset) copy(startOffsets = newStartOffsets, endOffsets = newEndOffsets) } @@ -201,43 +200,35 @@ class Sentence( object Sentence { def apply( - raw:Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int]): Sentence = + raw: Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int]): Sentence = new Sentence(raw, startOffsets, endOffsets, raw) // words are identical to raw tokens (a common situation) def apply( - raw:Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int], - words: Array[String]): Sentence = + raw: Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int], + words: Seq[String]): Sentence = new Sentence(raw, startOffsets, endOffsets, words) def apply( - raw: Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int], - words: Array[String], - tags: Option[Array[String]], - lemmas: Option[Array[String]], - entities: Option[Array[String]], - norms: Option[Array[String]], - chunks: Option[Array[String]], - tree: Option[Tree], - deps: GraphMap, - relations: Option[Array[RelationTriple]] + raw: Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int], + words: Seq[String], + tags: Option[Seq[String]], + lemmas: Option[Seq[String]], + entities: Option[Seq[String]] = None, + norms: Option[Seq[String]] = None, + chunks: Option[Seq[String]] = None, + tree: Option[Tree] = None, + deps: GraphMap.Type = GraphMap.empty, + relations: Option[Seq[RelationTriple]] = None ): Sentence = { - val s = Sentence(raw, startOffsets, endOffsets, words) - // update annotations - s.tags = tags - s.lemmas = lemmas - s.entities = entities - s.norms = norms - s.chunks = chunks - s.syntacticTree = tree - s.graphs = deps - s.relations = relations - s + new Sentence( + raw, startOffsets, endOffsets, words, + tags, lemmas, entities, norms, chunks, tree, deps, relations + ) } - -} \ No newline at end of file +} diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index b791d181e..2af5a1d3a 100644 --- 
a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -2,27 +2,28 @@ package org.clulab.processors.clu import com.typesafe.config.Config import com.typesafe.config.ConfigFactory -import org.clulab.numeric.{NumericEntityRecognizer, setLabelsAndNorms} +import org.clulab.numeric.NumericEntityRecognizer +import org.clulab.numeric.NumericUtils import org.clulab.processors.{Document, Processor, Sentence} -import org.clulab.processors.clu.tokenizer._ -import org.clulab.scala.WrappedArray._ -import org.clulab.scala_transformers.encoder.TokenClassifier +import org.clulab.processors.clu.tokenizer.Lemmatizer +import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, PortugueseLemmatizer, SpanishLemmatizer} +import org.clulab.processors.clu.tokenizer.Tokenizer +import org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer} +import org.clulab.processors.hexatagging.HexaDecoder import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException +import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.sequences.{LexiconNER, NamedEntity} import org.clulab.struct.DirectedGraph import org.clulab.struct.GraphMap import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies} +import org.clulab.utils.WrappedArraySeq import org.slf4j.{Logger, LoggerFactory} -import org.clulab.odin.Mention - import BalaurProcessor._ -import PostProcessor._ -import org.clulab.processors.hexatagging.HexaDecoder class BalaurProcessor protected ( val config: Config, - val optionalNER: Option[LexiconNER], + val lexiconNerOpt: Option[LexiconNER], val numericEntityRecognizerOpt: Option[NumericEntityRecognizer], wordTokenizer: Tokenizer, wordLemmatizer: Lemmatizer, @@ -34,230 +35,245 @@ class BalaurProcessor protected ( // standard, abbreviated constructor def this( config: Config = ConfigFactory.load("balaurprocessor"), - optionalNER: Option[LexiconNER] = None, + lexiconNerOpt: Option[LexiconNER] = None, seasonPathOpt: Option[String] = Some("/org/clulab/numeric/SEASON.tsv") ) = this( config, - optionalNER, + lexiconNerOpt, newNumericEntityRecognizerOpt(seasonPathOpt), - mkTokenizer(BalaurProcessor.getArgString(config, s"$prefix.language", Some("EN"))), - mkLemmatizer(BalaurProcessor.getArgString(config, s"$prefix.language", Some("EN"))), + mkTokenizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), + mkLemmatizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), // TokenClassifier.fromFiles(config.getString(s"$prefix.modelName")) TokenClassifier.fromResources(config.getString(s"$prefix.modelName")) ) def copy( - configOpt: Option[Config] = None, - optionalNEROpt: Option[Option[LexiconNER]] = None, - numericEntityRecognizerOptOpt: Option[Option[NumericEntityRecognizer]] = None, - wordTokenizerOpt: Option[Tokenizer] = None, - wordLemmatizerOpt: Option[Lemmatizer] = None, - tokenClassifierOpt: Option[TokenClassifier] = None + config: Config = config, + lexiconNerOpt: Option[LexiconNER] = lexiconNerOpt, + numericEntityRecognizerOpt: Option[NumericEntityRecognizer] = numericEntityRecognizerOpt, + wordTokenizer: Tokenizer = wordTokenizer, + wordLemmatizer: Lemmatizer = wordLemmatizer, + tokenClassifier: TokenClassifier = tokenClassifier ): BalaurProcessor = { new BalaurProcessor( - configOpt.getOrElse(this.config), - optionalNEROpt.getOrElse(this.optionalNER), - 
numericEntityRecognizerOptOpt.getOrElse(this.numericEntityRecognizerOpt), - wordTokenizerOpt.getOrElse(this.wordTokenizer), - wordLemmatizerOpt.getOrElse(this.wordLemmatizer), - tokenClassifierOpt.getOrElse(this.tokenClassifier) + config, + lexiconNerOpt, + numericEntityRecognizerOpt, + wordTokenizer, + wordLemmatizer, + tokenClassifier ) } + // TODO: Try not to make a new decoder for each processor? val hexaDecoder = new HexaDecoder() override def getConf: Config = config + // TODO: Why not make the wordTokenizer a val then? + def tokenizer: Tokenizer = wordTokenizer + override def mkDocument(text: String, keepText: Boolean): Document = { DocumentMaker.mkDocument(tokenizer, text, keepText) } - def tokenizer: Tokenizer = wordTokenizer - - override def mkDocumentFromSentences(sentences: Iterable[String], + override def mkDocumentFromSentences( + sentences: Iterable[String], keepText: Boolean, - charactersBetweenSentences: Int): Document = { + charactersBetweenSentences: Int + ): Document = { DocumentMaker.mkDocumentFromSentences(tokenizer, sentences, keepText, charactersBetweenSentences) } - override def mkDocumentFromTokens(sentences: Iterable[Iterable[String]], + override def mkDocumentFromTokens( + sentences: Iterable[Iterable[String]], keepText: Boolean, charactersBetweenSentences: Int, - charactersBetweenTokens: Int): Document = { + charactersBetweenTokens: Int + ): Document = { DocumentMaker.mkDocumentFromTokens(sentences, keepText, charactersBetweenSentences, charactersBetweenSentences) } - override def tagPartsOfSpeech(doc: Document): Unit = { - throw new RuntimeException("ERROR: cannot call this method on its own in this processor!") - } - - /** Lematization; modifies the document in place */ - override def lemmatize(doc: Document): Unit = { - for(sent <- doc.sentences) { - val lemmas = new Array[String](sent.size) - for(i <- sent.words.indices) { - lemmas(i) = wordLemmatizer.lemmatizeWord(sent.words(i)) - - // a lemma may be empty in some weird Unicode situations - if(lemmas(i).isEmpty) { - logger.debug(s"""WARNING: Found empty lemma for word #$i "${sent.words(i)}" in sentence: ${sent.words.mkString(" ")}""") - lemmas(i) = sent.words(i).toLowerCase() - } - } - sent.lemmas = Some(lemmas) + override def lemmatize(words: Seq[String]): Seq[String] = { + val lemmas = words.zipWithIndex.map { case (word, index) => + val lemma = wordLemmatizer.lemmatizeWord(word) + // A lemma may be empty in some weird Unicode situations. + val nonEmptyLemma = + if (lemma.isEmpty) { + logger.debug(s"""WARNING: Found empty lemma for word #$index "$word" in sentence: ${words.mkString(" ")}""") + word.toLowerCase() + } + else lemma + + nonEmptyLemma } + + lemmas } /** Generates cheap lemmas with the word in lower case, for languages where a lemmatizer is not available */ - def cheapLemmatize(doc:Document): Unit = { - for(sent <- doc.sentences) { - val lemmas = sent.words.map(_.toLowerCase()).toArray - sent.lemmas = Some(lemmas) - } - } + def cheapLemmatize(sentence: Sentence): Seq[String] = + sentence.words.map(_.toLowerCase()) - override def recognizeNamedEntities(doc: Document): Unit = { - throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!") - } + // TODO: Just don't include anything that calls this. 
+ def throwCannotCallException(methodName: String): Unit = + throw new RuntimeException(s"ERROR: cannot call $methodName on its own in this processor!") - override def parse(doc: Document): Unit = { - throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!") - } + override def tagPartsOfSpeech(doc: Document): Unit = throwCannotCallException("tagPartsOfSpeech") - override def srl(doc: Document): Unit = { - throw new RuntimeException("ERROR: functionality not supported in this procecessor!") - } + override def recognizeNamedEntities(doc: Document): Unit = throwCannotCallException("recognizeNamedEntities") - override def chunking(doc: Document): Unit = { - throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!") - } + override def parse(doc: Document): Unit = throwCannotCallException("parse") - override def resolveCoreference(doc: Document): Unit = { - throw new RuntimeException("ERROR: functionality not supported in this procecessor!") - } + override def chunking(doc: Document): Unit = throwCannotCallException("chunking") - override def discourse(doc: Document): Unit = { - throw new RuntimeException("ERROR: functionality not supported in this procecessor!") - } + def throwNotSupportedException(methodName: String): Unit = + throw new RuntimeException(s"ERROR: $methodName functionality not supported in this processor!") - override def relationExtraction(doc: Document): Unit = { - throw new RuntimeException("ERROR: functionality not supported in this procecessor!") - } + override def srl(doc: Document): Unit = throwNotSupportedException("srl") - override def annotate(doc: Document): Document = { - val verbose = false + override def resolveCoreference(doc: Document): Unit = throwNotSupportedException("resolveCoreference") + + override def discourse(doc: Document): Unit = throwNotSupportedException("discourse") - // lemmas are created deterministically, not through the MTL framework - lemmatize(doc) + override def relationExtraction(doc: Document): Unit = throwNotSupportedException("relationExtraction") + + override def annotate(doc: Document): Document = { + // Process one sentence at a time through the MTL framework. + val partlyAnnotatedSentences = doc.sentences.map { sentence => + val words = sentence.words + // Lemmas are created deterministically, not through the MTL framework.
+ val lemmas = lemmatize(words) - // process one sentence at a time through the MTL framework - for (sent <- doc.sentences) { try { - val allLabelsAndScores = tokenClassifier.predictWithScores(sent.words) - assignPosTags(allLabelsAndScores(TASK_TO_INDEX(POS_TASK)), sent) - assignNamedEntityLabels(allLabelsAndScores(TASK_TO_INDEX(NER_TASK)), sent) - assignChunkLabels(allLabelsAndScores(TASK_TO_INDEX(CHUNKING_TASK)), sent) - assignDependencyLabelsUsingHexaTags( + val allLabelsAndScores = tokenClassifier.predictWithScores(words) + val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK))) + val entities = { + val optionalEntities = mkNerLabelsOpt(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas) + + mkNamedEntityLabels(words, allLabelsAndScores(TASK_TO_INDEX(NER_TASK)), optionalEntities) + } + val chunks = mkChunkLabels(words, allLabelsAndScores(TASK_TO_INDEX(CHUNKING_TASK))) + val graphs = mkDependencyLabelsUsingHexaTags( + words, lemmas, tags, allLabelsAndScores(TASK_TO_INDEX(HEXA_TERM_TASK)), - allLabelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK)), - sent + allLabelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK)) + ) + // Entities and norms need to still be patched and filled in, so this is only a partly annotated sentence. + val partlyAnnotatedSentence = sentence.copy( + tags = Some(tags), lemmas = Some(lemmas), entities = Some(entities), chunks = Some(chunks), graphs = graphs ) - } catch { - case e: EncoderMaxTokensRuntimeException => - // this sentence exceeds the maximum number of tokens for the encoder - // TODO: at some point do something smart here - println(s"ERROR: this sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sent.words.mkString(" ")}") + partlyAnnotatedSentence + } + // TODO: Improve error handling. + catch { + // No values, not even lemmas, will be included in the annotation if there was an exception. + case e: EncoderMaxTokensRuntimeException => + // TODO: at some point do something smart here + println(s"ERROR: This sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sentence.words.mkString(" ")}") + sentence + case e: AssertionError => + println(s"ERROR: The output of predictWithScores does not satisfy assertions.
The sentence will not be annotated: ${sentence.words.mkString(" ")}") + sentence } } + val partlyAnnotatedDocument = doc.copy(sentences = partlyAnnotatedSentences) + val fullyAnnotatedDocument = numericEntityRecognizerOpt.map { numericEntityRecognizer => + val numericMentions = numericEntityRecognizer.extractFrom(partlyAnnotatedDocument) + val (newLabels, newNorms) = NumericUtils.mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) + val fullyAnnotatedSentences = partlyAnnotatedDocument.sentences.indices.map { index => + partlyAnnotatedDocument.sentences(index).copy( + entities = Some(newLabels(index)), + norms = Some(newNorms(index)) + ) + } - // numeric entities using our numeric entity recognizer based on Odin rules - if(numericEntityRecognizerOpt.nonEmpty) { - val numericMentions = extractNumericEntityMentions(doc) - setLabelsAndNorms(doc, numericMentions) - } + partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences) + }.getOrElse(partlyAnnotatedDocument) - doc + fullyAnnotatedDocument } - def extractNumericEntityMentions(doc:Document): Seq[Mention] = { - numericEntityRecognizerOpt.get.extractFrom(doc) - } + private def mkPosTags(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { + assert(labels.length == words.length) - private def assignPosTags(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) - sent.tags = Some(postprocessPartOfSpeechTags(sent.words, labels.map(_.head._1).toArray)) - } + val rawTags = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq + val cookedTags = PostProcessor.postprocessPartOfSpeechTags(words, rawTags) - /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ - private def assignNamedEntityLabels(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) + cookedTags + } - // NER labels from the custom NER - val optionalNERLabels: Option[Array[String]] = optionalNER.map { ner => + private def mkNerLabelsOpt( + words: Seq[String], startOffsets: Seq[Int], endOffsets: Seq[Int], + tags: Seq[String], lemmas: Seq[String] + ): Option[Seq[String]] = { + lexiconNerOpt.map { lexiconNer => val sentence = Sentence( - sent.words, - sent.startOffsets, - sent.endOffsets, - sent.words, - sent.tags, - sent.lemmas, - entities = None, - norms = None, - chunks = None, - tree = None, - deps = EMPTY_GRAPH, - relations = None + words, // TODO: Why isn't this raw? 
+ startOffsets, + endOffsets, + words, + Some(tags), + Some(lemmas) ) - ner.find(sentence) + lexiconNer.find(sentence) } + } - val genericLabels = NamedEntity.patch(labels.map(_.head._1).toArray) + /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ + private def mkNamedEntityLabels(words: Seq[String], labels: Array[Array[(String, Float)]], nerLabelsOpt: Option[Seq[String]]): Seq[String] = { + assert(labels.length == words.length) - if(optionalNERLabels.isEmpty) { - sent.entities = Some(genericLabels) - } else { + val labelsSeq = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq + val genericLabels = NamedEntity.patch(labelsSeq) + val specificLabels = nerLabelsOpt.map { nerLabels => //println(s"MERGING NE labels for sentence: ${sent.words.mkString(" ")}") //println(s"Generic labels: ${NamedEntity.patch(labels).mkString(", ")}") //println(s"Optional labels: ${optionalNERLabels.get.mkString(", ")}") - val mergedLabels = NamedEntity.patch(mergeNerLabels(genericLabels, optionalNERLabels.get)) + val mergedLabels = mergeNerLabels(genericLabels, nerLabels) + val patchedLabels = NamedEntity.patch(mergedLabels) //println(s"Merged labels: ${mergedLabels.mkString(", ")}") - sent.entities = Some(mergedLabels) - } + + patchedLabels + }.getOrElse(genericLabels) + + specificLabels } - private def mergeNerLabels(generic: Array[String], custom: Array[String]): Array[String] = { + private def mergeNerLabels(generic: Seq[String], custom: Seq[String]): Seq[String] = { require(generic.length == custom.length) val customNamedEntities = NamedEntity.collect(custom) - val result = generic.toArray // A copy of the generic labels is created here. if (customNamedEntities.isEmpty) - result + generic else { - val genericNamedEntities = NamedEntity.collect(generic) - //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}") + val genericNamedEntities = NamedEntity.collect(generic) + val combinedNamedEntities = NamedEntity.combine(generic, genericNamedEntities, customNamedEntities) - // The custom labels override the generic ones! - NamedEntity.combine(result, genericNamedEntities, customNamedEntities) + combinedNamedEntities } } - private def assignChunkLabels(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) - sent.chunks = Some(labels.map(_.head._1).toArray) + private def mkChunkLabels(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { + assert(labels.length == words.length) + + WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq } + // TODO: This appears to be unused. // The head has one score, the label has another. Here the two scores are interpolated // and the head and label are stored together in a single object with the score if the // object, the Dependency, has a valid absolute head. 
private def interpolateHeadsAndLabels( - sentHeadPredictionScores: Array[Array[PredictionScore]], - sentLabelPredictionScores: Array[Array[PredictionScore]], - lambda: Float): Array[Array[Dependency]] = { + sentHeadPredictionScores: Array[Array[PredictionScore]], + sentLabelPredictionScores: Array[Array[PredictionScore]], + lambda: Float + ): Array[Array[Dependency]] = { assert(sentHeadPredictionScores.length == sentLabelPredictionScores.length) val sentDependencies = sentHeadPredictionScores.zip(sentLabelPredictionScores).zipWithIndex.map { case ((wordHeadPredictionScores, wordLabelPredictionScores), wordIndex) => @@ -286,86 +302,77 @@ class BalaurProcessor protected ( sentDependencies.toArray } - private def assignDependencyLabelsUsingHexaTags( + private def mkDependencyLabelsUsingHexaTags( + words: Seq[String], lemmas: Seq[String], tags: Seq[String], termTags: Array[Array[PredictionScore]], - nonTermTags: Array[Array[PredictionScore]], - sent: Sentence): Unit = { + nonTermTags: Array[Array[PredictionScore]] + ): GraphMap.Type = { val verbose = false - + val size = words.length // bht is used just for debugging purposes here val (bht, deps, roots) = hexaDecoder.decode(termTags, nonTermTags, topK = 25, verbose) - if(verbose && bht.nonEmpty) { + + if (verbose && bht.nonEmpty) { println(bht) println(s"Dependencies (${deps.get.size}):") println(deps.mkString("\n")) println("Roots: " + roots.get.mkString(", ")) } - - if(deps.nonEmpty && roots.nonEmpty) { + if (deps.nonEmpty && roots.nonEmpty) { // basic dependencies that replicate treebank annotations - val depGraph = new DirectedGraph[String](deps.get, Some(sent.size), roots) - sent.graphs += GraphMap.UNIVERSAL_BASIC -> depGraph - + val depGraph = new DirectedGraph[String](deps.get, Some(size), roots) // enhanced dependencies as defined by Manning - val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(sent, depGraph) - sent.graphs += GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph - - // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles - // however, this processor produces only syntactic dependencies - sent.graphs += GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph + val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(words, lemmas, tags, depGraph) + + Map( + GraphMap.UNIVERSAL_BASIC -> depGraph, + GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph, + // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles + // however, this processor produces only syntactic dependencies + GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph + ) } - } + else + GraphMap.empty + } } object BalaurProcessor { - val logger:Logger = LoggerFactory.getLogger(classOf[BalaurProcessor]) - val prefix:String = "BalaurProcessor" - - val OUTSIDE = "O" - val EMPTY_GRAPH = GraphMap() + val logger: Logger = LoggerFactory.getLogger(classOf[BalaurProcessor]) + val prefix: String = "BalaurProcessor" val NER_TASK = "NER" val POS_TASK = "POS" val CHUNKING_TASK = "Chunking" - val DEPS_HEAD_TASK = "Deps Head" - val DEPS_LABEL_TASK = "Deps Label" val HEXA_TERM_TASK = "Hexa Term" val HEXA_NONTERM_TASK = "Hexa NonTerm" - val PARSING_INTERPOLATION_LAMBDA = 0.6f - val PARSING_TOPK = 5 - // maps a task name to a head index in the encoder - val TASK_TO_INDEX = Map( - NER_TASK -> 0, - POS_TASK -> 1, - CHUNKING_TASK -> 2, - HEXA_TERM_TASK -> 3, - HEXA_NONTERM_TASK -> 4 - ) - - def mkTokenizer(lang: String): Tokenizer = { - lang match { - case "PT" => new 
OpenDomainPortugueseTokenizer - case "ES" => new OpenDomainSpanishTokenizer - case _ => new OpenDomainEnglishTokenizer - } + val TASK_TO_INDEX: Map[String, Int] = Seq( + NER_TASK, + POS_TASK, + CHUNKING_TASK, + HEXA_TERM_TASK, + HEXA_NONTERM_TASK + ).zipWithIndex.toMap + + def mkTokenizer(lang: String): Tokenizer = lang match { + case "PT" => new OpenDomainPortugueseTokenizer + case "ES" => new OpenDomainSpanishTokenizer + case "EN" | _ => new OpenDomainEnglishTokenizer } - def mkLemmatizer(lang: String): Lemmatizer = { - lang match { - case "PT" => new PortugueseLemmatizer - case "ES" => new SpanishLemmatizer - case _ => new EnglishLemmatizer - } + def mkLemmatizer(lang: String): Lemmatizer = lang match { + case "PT" => new PortugueseLemmatizer + case "ES" => new SpanishLemmatizer + case "EN" | _ => new EnglishLemmatizer } - def getArgString (config: Config, argPath: String, defaultValue: Option[String]): String = - if (config.hasPath(argPath)) config.getString(argPath) - else if(defaultValue.nonEmpty) defaultValue.get - else throw new RuntimeException(s"ERROR: parameter $argPath must be defined!") + def getConfigArgString (config: Config, argPath: String, defaultValue: Option[String]): String = + if (config.hasPath(argPath)) config.getString(argPath) + else if (defaultValue.nonEmpty) defaultValue.get + else throw new RuntimeException(s"ERROR: parameter $argPath must be defined!") - def newNumericEntityRecognizerOpt(seasonPathOpt: Option[String]): Option[NumericEntityRecognizer] = { - seasonPathOpt.map(NumericEntityRecognizer(_)) - } + def newNumericEntityRecognizerOpt(seasonPathOpt: Option[String]): Option[NumericEntityRecognizer] = + seasonPathOpt.map(NumericEntityRecognizer(_)) } diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index 92168b4bd..0a303701e 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -1,90 +1,100 @@ package org.clulab.processors.clu -import org.slf4j.LoggerFactory -import org.slf4j.Logger -import org.clulab.processors.clu.tokenizer.Tokenizer import org.clulab.processors.Document -import scala.collection.mutable.ArrayBuffer import org.clulab.processors.Sentence +import org.clulab.processors.clu.tokenizer.Tokenizer +import org.clulab.utils.WrappedArraySeq +import org.slf4j.Logger +import org.slf4j.LoggerFactory + +import scala.collection.compat._ +import scala.collection.mutable.ArrayBuffer class DocumentMaker object DocumentMaker { - val logger:Logger = LoggerFactory.getLogger(classOf[DocumentMaker]) + val logger: Logger = LoggerFactory.getLogger(classOf[DocumentMaker]) /** Constructs a document of tokens from free text; includes sentence splitting and tokenization */ - def mkDocument(tokenizer:Tokenizer, - text:String, - keepText:Boolean): Document = { - val sents = tokenizer.tokenize(text) - val doc = new Document(sents) - if(keepText) doc.text = Some(text) - doc + def mkDocument( // TODO: mkDocumentFromText + tokenizer: Tokenizer, + text: String, + keepText: Boolean + ): Document = { + val sentences = tokenizer.tokenize(text) + val textOpt = Option.when(keepText)(text) + val document = Document(sentences, textOpt) + + document } /** Constructs a document of tokens from an array of untokenized sentences */ - def mkDocumentFromSentences(tokenizer:Tokenizer, - sentences:Iterable[String], - keepText:Boolean, - charactersBetweenSentences:Int): Document = { - 
val sents = new ArrayBuffer[Sentence]() + def mkDocumentFromSentences( // TODO: mkDocumentFromTexts + tokenizer: Tokenizer, + texts: Iterable[String], + keepText: Boolean, + charactersBetweenSentences: Int + ): Document = { + val sentenceSep = " " * charactersBetweenSentences var characterOffset = 0 - for(text <- sentences) { - val sent = tokenizer.tokenize(text, sentenceSplit = false).head // we produce a single sentence here! - - // update character offsets between sentences - for(i <- 0 until sent.size) { - sent.startOffsets(i) += characterOffset - sent.endOffsets(i) += characterOffset - } + val sentencesArray = texts.map { text => + val sentence = tokenizer.tokenize(text, sentenceSplit = false, characterOffset).head // We produce a single sentence here! - // move the character offset after the current sentence - characterOffset = sent.endOffsets.last + charactersBetweenSentences + characterOffset = sentence.endOffsets.last + charactersBetweenSentences + sentence + }.toArray + val sentences = WrappedArraySeq(sentencesArray).toImmutableSeq + val textOpt = Option.when(keepText)(texts.mkString(sentenceSep)) + val document = Document(sentences, textOpt) - //println("SENTENCE: " + sent.words.mkString(", ")) - //println("Start offsets: " + sent.startOffsets.mkString(", ")) - //println("End offsets: " + sent.endOffsets.mkString(", ")) - sents += sent - } - val doc = new Document(sents.toArray) - if(keepText) doc.text = Some(sentences.mkString(mkSep(charactersBetweenSentences))) - doc + document } /** Constructs a document of tokens from an array of tokenized sentences */ - def mkDocumentFromTokens(sentences:Iterable[Iterable[String]], - keepText:Boolean, - charactersBetweenSentences:Int, - charactersBetweenTokens:Int): Document = { + def mkDocumentFromTokens( // TODO: mkDocumentFromTokenizedTexts + tokenizedTexts: Iterable[Iterable[String]], + keepText: Boolean, + charactersBetweenSentences: Int, + charactersBetweenTokens: Int + ): Document = { + val sentenceSep = " " * charactersBetweenSentences + val tokenSep = " " * charactersBetweenTokens var charOffset = 0 - var sents = new ArrayBuffer[Sentence]() val text = new StringBuilder - for(sentence <- sentences) { - val startOffsets = new ArrayBuffer[Int]() - val endOffsets = new ArrayBuffer[Int]() - for(word <- sentence) { - startOffsets += charOffset - charOffset += word.length - endOffsets += charOffset + // Just use one buffer for each but clear them as necessary. + val startOffsetsBuffer = new ArrayBuffer[Int]() + val endOffsetsBuffer = new ArrayBuffer[Int]() + val sentencesArray = tokenizedTexts.map { tokenizedTextIterable => + // We are going to need the tokens in an array anyway, so make them now. + val tokenizedTextArray = tokenizedTextIterable.toArray + + tokenizedTextArray.foreach { token => + startOffsetsBuffer += charOffset + charOffset += token.length + endOffsetsBuffer += charOffset charOffset += charactersBetweenTokens } - // note: NO postprocessing happens in this case, so use it carefully! - sents += new Sentence(sentence.toArray, startOffsets.toArray, endOffsets.toArray, sentence.toArray) - charOffset += charactersBetweenSentences - charactersBetweenTokens - if(keepText) { - text.append(sentence.mkString(mkSep(charactersBetweenTokens))) - text.append(mkSep(charactersBetweenSentences)) - } - } + // The simple version of this doesn't work if there were no tokens.
+ charOffset += charactersBetweenSentences - (if (tokenizedTextArray.nonEmpty) charactersBetweenTokens else 0) - val doc = new Document(sents.toArray) - if(keepText) doc.text = Some(text.toString) - doc - } + // Note: NO postprocessing happens in this case, so use it carefully! + val startOffsets = WrappedArraySeq(startOffsetsBuffer.toArray).toImmutableSeq + startOffsetsBuffer.clear() + val endOffsets = WrappedArraySeq(endOffsetsBuffer.toArray).toImmutableSeq + endOffsetsBuffer.clear() + val tokens = WrappedArraySeq(tokenizedTextArray).toImmutableSeq + val sentence = new Sentence(tokens, startOffsets, endOffsets, tokens) + + if (keepText) { + text.append(tokens.mkString(tokenSep)) + text.append(sentenceSep) + } + sentence + }.toArray + val sentences = WrappedArraySeq(sentencesArray).toImmutableSeq + val textOpt = Option.when(keepText)(text.toString) + val document = Document(sentences, textOpt) - private def mkSep(size:Int):String = { - val os = new StringBuilder - for (_ <- 0 until size) os.append(" ") - os.toString() + document } } diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala new file mode 100644 index 000000000..22c9845c7 --- /dev/null +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala @@ -0,0 +1,91 @@ +package org.clulab.processors.clu + +import org.clulab.processors.Document +import org.clulab.struct.DirectedGraphEdgeIterator + +import java.io.PrintWriter + +trait DocumentPrinter { + def print(document: Document): Unit +} + +class DocumentPrettyPrinter(printWriter: PrintWriter) extends DocumentPrinter { + + def println(string: String): Unit = printWriter.println(string) + + def print(document: Document): Unit = { + // let's print the sentence-level annotations + document.sentences.zipWithIndex.foreach { case (sentence, sentenceCount) => + println("Sentence #" + sentenceCount + ":") + println("Tokens: " + sentence.words.zipWithIndex.mkString(" ")) + println("Start character offsets: " + sentence.startOffsets.mkString(" ")) + println("End character offsets: " + sentence.endOffsets.mkString(" ")) + + // these annotations are optional, so they are stored using Option objects, hence the foreach statement + sentence.lemmas.foreach(lemmas => println(s"Lemmas: ${lemmas.mkString(" ")}")) + sentence.tags.foreach(tags => println(s"POS tags: ${tags.mkString(" ")}")) + sentence.chunks.foreach(chunks => println(s"Chunks: ${chunks.mkString(" ")}")) + sentence.entities.foreach(entities => println(s"Named entities: ${entities.mkString(" ")}")) + sentence.norms.foreach(norms => println(s"Normalized entities: ${norms.mkString(" ")}")) + sentence.universalBasicDependencies.foreach(dependencies => { + println("Basic syntactic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.universalEnhancedDependencies.foreach(dependencies => { + println("Enhanced syntactic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + 
sentence.semanticRoles.foreach(dependencies => { + println("Semantic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.enhancedSemanticRoles.foreach(dependencies => { + println("Enhanced semantic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.syntacticTree.foreach(tree => { + println("Constituent tree: " + tree.toStringDepth(showHead = false)) + // see the org.clulab.struct.Tree class for more information + // on syntactic trees, including access to head phrases/words + }) + + println("\n") + } + + // let's print the coreference chains + document.coreferenceChains.foreach(chains => { + for (chain <- chains.getChains) { + println("Found one coreference chain containing the following mentions:") + for (mention <- chain) { + // note that all these offsets start at 0 too + println("\tsentenceIndex:" + mention.sentenceIndex + + " headIndex:" + mention.headIndex + + " startTokenOffset:" + mention.startOffset + + " endTokenOffset:" + mention.endOffset + + " text: " + document.sentences(mention.sentenceIndex).words.slice(mention.startOffset, mention.endOffset).mkString("[", " ", "]")) + } + } + }) + printWriter.flush() + } +} diff --git a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala index 8de6a5be2..2226e4642 100644 --- a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala @@ -1,9 +1,7 @@ package org.clulab.processors.clu -import org.clulab.processors.Sentence - import java.util.regex.Pattern -import org.clulab.struct.Edge +import scala.collection.mutable object PostProcessor { // @@ -14,35 +12,32 @@ object PostProcessor { // Matches agricultural season short hands such as "2021DS" or "2021WS" val WET_OR_DRY_SEASON = Pattern.compile("""(?i)[0-9]+(ds|ws)""") - /** POS tag corrections, in place */ - def postprocessPartOfSpeechTags(words: Array[String], tags: Array[String]): Array[String] = { - - // unigram patterns - words.indices.foreach { index => - if (tags(index) != "CC" && VERSUS_PATTERN.matcher(words(index)).matches) { - tags(index) = "CC" // "versus" seems like a CC to me. but maybe not... + /** POS tag corrections */ + def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { + val newTags = words.indices.map { index => + val word = words(index) + val oldTag = tags(index) + val newTag = { + // unigram patterns + if (VERSUS_PATTERN.matcher(word).matches) + "CC" // "versus" seems like a CC to me. but maybe not... 
+ else if (WET_OR_DRY_SEASON.matcher(word).matches) + "CD" // such years should be CDs because our grammars expect it + // bigram patterns + else if (word.equalsIgnoreCase("due")) { + if (words.lift(index + 1).map(_.toLowerCase).contains("to")) "IN" // "due" in "due to" must be a preposition + else oldTag + } + else if (word.equalsIgnoreCase("fall")) { + if (tags.lift(index + 1).contains("CD")) "NN" // "fall" followed by a CD must be NN + else oldTag + } + else oldTag } - if(WET_OR_DRY_SEASON.matcher(words(index)).matches) { - tags(index) = "CD" // such years should be CDs because our grammars expect it - } + newTag } - // bigram patterns - words.indices.dropRight(1).foreach { curr => - val next = curr + 1 - // "due" in "due to" must be a preposition - if (words(curr).equalsIgnoreCase("due") && words(next).equalsIgnoreCase("to")) { - tags(curr) = "IN" - } - - // "fall" followed by a CD must be NN - else if(words(curr).equalsIgnoreCase("fall") && tags(next).equals("CD")) { - tags(curr) = "NN" - } - } - - tags + newTags } - } diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index 6e4494ca4..31d25ed9c 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -2,7 +2,7 @@ package org.clulab.processors.clu import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.struct.{DirectedGraph, Edge, GraphMap, RelationTriple, Tree} -import org.clulab.struct.GraphMap._ +import org.clulab.utils.WrappedArraySeq import scala.collection.mutable.{Set => MutableSet} @@ -48,7 +48,7 @@ class VeiledText(originalText: String, veiledLetters: Seq[Range]) extends Veil { } protected def unveilDocument(veiledDocument: Document): Document = { - val unveiledDocument = veiledDocument.copy(textOpt = Some(originalText)) + val unveiledDocument = veiledDocument.copy(text = Some(originalText)) unveiledDocument } @@ -109,7 +109,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) } protected lazy val veiledDocument = { val veiledSentences = originalDocument.sentences.zipWithIndex.map { case (originalSentence, sentenceIndex) => - val wordIndexes = originalSentence.words.indices.filterNot(veilSets(sentenceIndex)).toArray + val wordIndexes = originalSentence.words.indices.filterNot(veilSets(sentenceIndex)) val veiledRaw = wordIndexes.map(originalSentence.raw) val veiledStartOffsets = wordIndexes.map(originalSentence.startOffsets) val veiledEndOffsets = wordIndexes.map(originalSentence.endOffsets) @@ -122,7 +122,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) originalDocument.copy(veiledSentences) } - def unveilStringArray(veiledArrayOpt: Option[Array[String]], sentenceIndex: Int, veil: String): Option[Array[String]] = { + def unveilStringArray(veiledArrayOpt: Option[Seq[String]], sentenceIndex: Int, veil: String): Option[Seq[String]] = { val unveilArray = unveilArrays(sentenceIndex) val originalLength = originalDocument.sentences(sentenceIndex).words.length @@ -132,22 +132,20 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) veiledArray.zipWithIndex.foreach { case (veiledString, veiledIndex) => unveiledArray(unveilArray(veiledIndex)) = veiledString } - unveiledArray + WrappedArraySeq(unveiledArray).toImmutableSeq } } - def unveilGraphs(veiledGraphs: GraphMap, sentenceIndex: Int): GraphMap = { + def unveilGraphs(veiledGraphs: GraphMap.Type, 
sentenceIndex: Int): GraphMap.Type = { val unveilArray = unveilArrays(sentenceIndex) - val unveiledGraphs = GraphMap() val originalLength = originalDocument.sentences(sentenceIndex).words.length - - veiledGraphs.foreach { case (name, veiledDirectedGraph) => + val unveiledGraphs = veiledGraphs.map { case (name, veiledDirectedGraph) => val unveiledEdges = veiledDirectedGraph.allEdges.map { case (veiledSource, veiledDestination, relation) => Edge(unveilArray(veiledSource), unveilArray(veiledDestination), relation) } val unveiledRoots = veiledDirectedGraph.roots.map(unveilArray) - unveiledGraphs(name) = new DirectedGraph(unveiledEdges, Some(originalLength), Some(unveiledRoots)) + name -> new DirectedGraph(unveiledEdges, Some(originalLength), Some(unveiledRoots)) } unveiledGraphs } @@ -156,7 +154,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) def unveilSyntacticTree(syntacticTreeOpt: Option[Tree]): Option[Tree] = syntacticTreeOpt // TODO - def unveilRelations(relations: Option[Array[RelationTriple]]): Option[Array[RelationTriple]] = relations + def unveilRelations(relations: Option[Seq[RelationTriple]]): Option[Seq[RelationTriple]] = relations protected def unveilSentence(veiledSentence: Sentence, sentenceIndex: Int): Sentence = { val originalSentence = originalDocument.sentences(sentenceIndex) @@ -164,21 +162,27 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) val unveiledStartOffsets = originalSentence.startOffsets val unveiledEndOffsets = originalSentence.endOffsets val unveiledWords = originalSentence.words + val unveiledSentence = veiledSentence.copy(unveiledRaw, unveiledStartOffsets, unveiledEndOffsets, unveiledWords) - def unveilStringArray(veiledArrayOpt: Option[Array[String]], veil: String): Option[Array[String]] = + def unveilStringArray(veiledArrayOpt: Option[Seq[String]], veil: String): Option[Seq[String]] = this.unveilStringArray(veiledArrayOpt, sentenceIndex, veil) - unveiledSentence.tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) - unveiledSentence.lemmas = unveilStringArray(unveiledSentence.lemmas, Veil.veiledLemma) - unveiledSentence.entities = unveilStringArray(unveiledSentence.entities, Veil.veiledEntity) - unveiledSentence.norms = unveilStringArray(unveiledSentence.norms, Veil.veiledNorm) - unveiledSentence.chunks = unveilStringArray(unveiledSentence.chunks, Veil.veiledChunk) - - unveiledSentence.syntacticTree = unveilSyntacticTree(unveiledSentence.syntacticTree) - unveiledSentence.graphs = unveilGraphs(unveiledSentence.graphs, sentenceIndex) - unveiledSentence.relations = unveilRelations(unveiledSentence.relations) - unveiledSentence + val tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) + val lemmas = unveilStringArray(unveiledSentence.lemmas, Veil.veiledLemma) + val entities = unveilStringArray(unveiledSentence.entities, Veil.veiledEntity) + val norms = unveilStringArray(unveiledSentence.norms, Veil.veiledNorm) + val chunks = unveilStringArray(unveiledSentence.chunks, Veil.veiledChunk) + + val syntacticTree = unveilSyntacticTree(unveiledSentence.syntacticTree) + val graphs = unveilGraphs(unveiledSentence.graphs, sentenceIndex) + val relations = unveilRelations(unveiledSentence.relations) + + val newSentence = Sentence( + unveiledSentence.raw, unveiledSentence.startOffsets, unveiledSentence.endOffsets, unveiledSentence.words, + tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations + ) + newSentence } protected def unveilDocument(veiledDocument: 
Document): Document = { diff --git a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala index f644da4f0..8a4790246 100644 --- a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala +++ b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala @@ -1,8 +1,10 @@ package org.clulab.processors.clu.tokenizer import org.clulab.processors.Sentence +import org.clulab.scala.WrappedArrayBuffer._ import java.io.{BufferedReader, InputStreamReader} +import scala.collection.compat._ import scala.collection.mutable.ArrayBuffer import scala.util.matching.Regex import scala.util.Using @@ -10,15 +12,17 @@ import scala.util.Using import SentenceSplitter._ trait SentenceSplitter { - def split(tokens:Array[RawToken], sentenceSplit:Boolean):Array[Sentence] + def split(tokens:Array[RawToken], sentenceSplit:Boolean, characterOffset: Int = 0):Seq[Sentence] } abstract class RuleBasedSentenceSplitter extends SentenceSplitter { /** * Sentence splitting over a stream of tokens - * This includes detection of abbreviations as well + * This includes detection of abbreviations as well. + * The characterOffset is included so that Sentences + * in a longer text need not be edited afterward. **/ - override def split(tokens:Array[RawToken], sentenceSplit:Boolean):Array[Sentence] = { + override def split(tokens: Array[RawToken], sentenceSplit: Boolean, characterOffset: Int): Seq[Sentence] = { val sentences = new ArrayBuffer[Sentence]() var raw = new ArrayBuffer[String]() var words = new ArrayBuffer[String]() @@ -26,49 +30,46 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { var endPositions = new ArrayBuffer[Int]() for (i <- tokens.indices) { - val crt = tokens(i) - + val curr: RawToken = tokens(i) // next and previous tokens. We need these to detect proper ends of sentences - var next: Option[RawToken] = None - if (i < tokens.length - 1) next = Some(tokens(i + 1)) - var prev: Option[RawToken] = None - if (i > 0) prev = Some(tokens(i - 1)) + val nextOpt: Option[RawToken] = Option.when(i < tokens.length - 1)(tokens(i + 1)) + val prevOpt: Option[RawToken] = Option.when(i > 0)(tokens(i - 1)) // // we handle end-of-sentence markers (periods, etc.) here // this includes detecting if a period belongs to the previous token (if it's an abbreviation) // and understanding if this token actually marks the end of a sentence // - if (EOS.findFirstIn(crt.word).isDefined) { + if (EOS.findFirstIn(curr.word).isDefined) { // found a token that normally indicates end of sentence var isEos = sentenceSplit // period that probably belongs to an abbreviation and should not be marked as EOS - if (crt.word == "." && prev.isDefined && isAbbreviation(prev.get.word) && crt.beginPosition == prev.get.endPosition) { + if (curr.word == "." 
&& prevOpt.isDefined && isAbbreviation(prevOpt.get.word) && curr.beginPosition == prevOpt.get.endPosition) { // found a period that should be attached to the previous abbreviation - endPositions(endPositions.size - 1) = crt.endPosition - words(words.size - 1) = words.last + crt.word - raw(raw.size - 1) = raw.last + crt.raw + endPositions(endPositions.size - 1) = curr.endPosition + characterOffset + words(words.size - 1) = words.last + curr.word + raw(raw.size - 1) = raw.last + curr.raw // this is not an end of sentence if the next token does NOT look like the start of a sentence // TODO: maybe this should be handled with a binary classifier instead? - if (isEos && next.isDefined && !isSentStart(next.get.word)) { + if (isEos && nextOpt.isDefined && !isSentStart(nextOpt.get.word)) { isEos = false } } // regular end-of-sentence marker; treat is a distinct token else { - raw += crt.raw - words += crt.word - beginPositions += crt.beginPosition - endPositions += crt.endPosition + raw += curr.raw + words += curr.word + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.endPosition + characterOffset } // found a valid end of sentence; start an empty one if (isEos) { - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) - raw = new ArrayBuffer[String]() + sentences += Sentence(raw, beginPositions, endPositions, words) + raw = new ArrayBuffer[String]() // TODO: Check whether clear() is sufficient. words = new ArrayBuffer[String]() beginPositions = new ArrayBuffer[Int]() endPositions = new ArrayBuffer[Int]() @@ -76,27 +77,27 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { } // found a period *inside* a token; sometimes this is an EOS - else if(EOS_FOLLOWEDBY_BULLET.findFirstIn(crt.raw).isDefined && - crt.raw.lastIndexOf('.') > 0 && - next.isDefined && isSentStart(next.get.word)) { + else if(EOS_FOLLOWEDBY_BULLET.findFirstIn(curr.raw).isDefined && + curr.raw.lastIndexOf('.') > 0 && + nextOpt.isDefined && isSentStart(nextOpt.get.word)) { //println(s"FOUND EOS INSIDE TOKEN: ${crt.raw}") // // create the last token from the token fragment before the period, and the period itself // - val dotRawPosition = crt.raw.lastIndexOf('.') + val dotRawPosition = curr.raw.lastIndexOf('.') assert(dotRawPosition > 0) - val dotWordPosition = crt.word.lastIndexOf('.') + val dotWordPosition = curr.word.lastIndexOf('.') assert(dotWordPosition > 0) - raw += crt.raw.substring(0, dotRawPosition) - words += crt.word.substring(0, dotWordPosition) - beginPositions += crt.beginPosition - endPositions += crt.beginPosition + dotRawPosition + raw += curr.raw.substring(0, dotRawPosition) + words += curr.word.substring(0, dotWordPosition) + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.beginPosition + dotRawPosition + characterOffset // This is just for the period with length of 1. 
- raw += crt.raw.substring(dotRawPosition, dotRawPosition + 1) - words += crt.word.substring(dotWordPosition, dotWordPosition + 1) + raw += curr.raw.substring(dotRawPosition, dotRawPosition + 1) + words += curr.word.substring(dotWordPosition, dotWordPosition + 1) beginPositions += endPositions.last endPositions += beginPositions.last + 1 val lastPosition = endPositions.last @@ -104,7 +105,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // // create the current sentence // - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) + sentences += Sentence(raw, beginPositions, endPositions, words) raw = new ArrayBuffer[String]() words = new ArrayBuffer[String]() beginPositions = new ArrayBuffer[Int]() @@ -113,27 +114,27 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // // add the part of the token after the period to the new sentence // - raw += crt.raw.substring(dotRawPosition + 1) - words += crt.word.substring(dotWordPosition + 1) + raw += curr.raw.substring(dotRawPosition + 1) + words += curr.word.substring(dotWordPosition + 1) beginPositions += lastPosition endPositions += lastPosition + raw.head.length } else { // just a regular token - raw += crt.raw - words += crt.word - beginPositions += crt.beginPosition - endPositions += crt.endPosition + raw += curr.raw + words += curr.word + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.endPosition + characterOffset } } // a few words left over at the end if (words.nonEmpty) { - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) + sentences += Sentence(raw, beginPositions, endPositions, words) } - sentences.toArray + sentences } def isAbbreviation(word:String):Boolean diff --git a/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala b/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala index 85c6a09bc..11fbff7fb 100644 --- a/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala +++ b/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala @@ -70,7 +70,7 @@ class Tokenizer( } /** Tokenization and sentence splitting */ - def tokenize(text: String, sentenceSplit: Boolean = true): Array[Sentence] = { + def tokenize(text: String, sentenceSplit: Boolean = true, characterOffset: Int = 0): Seq[Sentence] = { // raw tokenization, using the antlr grammar val rawTokens = readTokens(text) // now apply all the additional non-Antlr steps such as solving contractions, normalization, post-processing @@ -78,7 +78,7 @@ class Tokenizer( step.process(rawTokens) } // sentence splitting, including detection of abbreviations - val sentences = sentenceSplitter.split(stepTokens, sentenceSplit) + val sentences = sentenceSplitter.split(stepTokens, sentenceSplit, characterOffset) sentences } diff --git a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala index 3278df5f2..d9fb83262 100644 --- a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala @@ -168,7 +168,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( // original sentence val origSentence = sentences(sentOffset) // actual sentence to be used - val sentence = if (leftToRight) origSentence else origSentence.revert() + val sentence = if (leftToRight) origSentence else 
origSentence.reverse() // labels to be learned val labels = if (leftToRight) labelExtractor(origSentence) @@ -211,7 +211,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( origSentence: Sentence, firstPassLabels:Option[Array[L]], leftToRight:Boolean): Array[L] = { - val sentence = if(leftToRight) origSentence else origSentence.revert() + val sentence = if(leftToRight) origSentence else origSentence.reverse() val firstPass = if(firstPassLabels.nonEmpty) { @@ -233,7 +233,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( if(leftToRight) history.toArray else SeqUtils.revert(history).toArray } - override def classesOf(sentence: Sentence):Array[L] = { + override def classesOf(sentence: Sentence):Seq[L] = { var firstPassLabels:Option[Array[L]] = None if(firstPassModel.nonEmpty) firstPassLabels = Some(classesOf(firstPassModel.get, sentence, None, ! leftToRight)) diff --git a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala index cbc12a745..9ab41afda 100644 --- a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala @@ -2,6 +2,7 @@ package org.clulab.sequences import org.clulab.processors.Sentence import org.clulab.sequences.LexiconNER._ +import org.clulab.scala.WrappedArray._ import org.clulab.struct.EntityValidator import org.clulab.struct.IntHashTrie @@ -64,7 +65,7 @@ class CombinedLexiconNER ( * @param sentence The input sentence * @return An array of BIO notations the store the outcome of the matches */ - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val caseSensitiveTokens = getTokens(sentence) val caseInsensitiveTokens = if (hasCaseInsensitive) caseSensitiveTokens.map(_.toLowerCase) else caseSensitiveTokens val seq = findLongestMatch(sentence, caseSensitiveTokens, caseInsensitiveTokens) @@ -79,7 +80,7 @@ class CombinedLexiconNER ( * This means that the longest match is always chosen, even if coming from a matcher with lower priority * Only ties are disambiguated according to the order provided in the constructor */ - protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Array[String], caseInsensitiveTokens: Array[String]): Array[String] = { + protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Seq[String], caseInsensitiveTokens: Seq[String]): Seq[String] = { val labels = new Array[String](caseSensitiveTokens.length) val length = labels.length var offset = 0 @@ -91,7 +92,7 @@ class CombinedLexiconNER ( def getSpanAndIndex: CombinedLexiconNER.SpanAndIndex = { - def innerGetSpanAndIndex(condition: Boolean, intHashTrie: IntHashTrie, tokens: => Array[String]): CombinedLexiconNER.SpanAndIndex = { + def innerGetSpanAndIndex(condition: Boolean, intHashTrie: IntHashTrie, tokens: => Seq[String]): CombinedLexiconNER.SpanAndIndex = { if (condition) { val intTrieNodeMatch = intHashTrie.findAt(tokens, offset) CombinedLexiconNER.SpanAndIndex(intTrieNodeMatch.length, intTrieNodeMatch.completePath) diff --git a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala index 924c8688f..2d16ddbf2 100644 --- a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala @@ -1,15 +1,15 @@ package org.clulab.sequences -import java.io.ObjectInputStream 
-import java.io.ObjectOutputStream -import java.util.Arrays - import org.clulab.processors.Sentence import org.clulab.sequences.LexiconNER.OUTSIDE_LABEL +import org.clulab.scala.WrappedArray._ import org.clulab.struct.EntityValidator import org.clulab.struct.IntHashTrie import org.clulab.struct.IntTrieNode +import java.io.ObjectInputStream +import java.io.ObjectOutputStream +import java.util.Arrays import scala.collection.mutable /** Lexicon-based NER similar to [[org.clulab.sequences.CombinedLexiconNER CombinedLexiconNER]] but which @@ -56,7 +56,7 @@ class CompactLexiconNER( def getLabels: Seq[String] = labels - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val caseSensitiveTokens = getTokens(sentence) val caseInsensitiveTokens = if (hasCaseInsensitive) caseSensitiveTokens.map(_.toLowerCase) @@ -66,14 +66,14 @@ class CompactLexiconNER( seq } - protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Array[String], - caseInsensitiveTokens: Array[String]): Array[String] = { + protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Seq[String], + caseInsensitiveTokens: Seq[String]): Seq[String] = { val labels = new Array[String](caseSensitiveTokens.length) val length = labels.length var offset = 0 - val caseSensitiveStringIds = if (hasCaseSensitive) caseSensitiveTokens.map( caseSensitiveCompactTrie.stringIds) else Array.empty[Int] - val caseInsensitiveStringIds = if (hasCaseInsensitive) caseInsensitiveTokens.map(caseInsensitiveCompactTrie.stringIds) else Array.empty[Int] + val caseSensitiveStringIds = if (hasCaseSensitive) caseSensitiveTokens.map( caseSensitiveCompactTrie.stringIds) else Seq.empty[Int] + val caseInsensitiveStringIds = if (hasCaseInsensitive) caseInsensitiveTokens.map(caseInsensitiveCompactTrie.stringIds) else Seq.empty[Int] // These are intended to cut down on the number of objects created. // It worked better when there was only one setting for case. @@ -88,7 +88,7 @@ class CompactLexiconNER( def updateSpanAndIndex(): Unit = { - def innerGetSpanAndIndex(condition: Boolean, stringIds: Array[Int], spanAndIndex: SpanAndIndex, + def innerGetSpanAndIndex(condition: Boolean, stringIds: Seq[Int], spanAndIndex: SpanAndIndex, compactTrie: CompactTrie): SpanAndIndex = { if (condition) { val id = stringIds(offset) @@ -136,7 +136,7 @@ class CompactLexiconNER( labels } - def findAt(ids: Array[Int], wordIndex: Int, nodeMatch: SpanAndIndex, compactTrie: CompactTrie): Unit = { + def findAt(ids: Seq[Int], wordIndex: Int, nodeMatch: SpanAndIndex, compactTrie: CompactTrie): Unit = { def linearSearch(value: Int, left: Int, right: Int): Int = { var index = left @@ -376,7 +376,7 @@ object CompactLexiconNER { // Assume that trieNodes are already sorted as much as necessary and all the tokens have stringIds. // Returns the number of parentsAdded and childrenAdded - def add(trieNodes: Array[IntTrieNode], parentOffset: Int, childOffset: Int): (Int, Int) = { + def add(trieNodes: Seq[IntTrieNode], parentOffset: Int, childOffset: Int): (Int, Int) = { // Area between parentOffset and parentOffset + parentRserve is for this recursive pass and // likewise for between childOffset and childOffset + childReserve. 
val parentReserve = trieNodes.length diff --git a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala index b1c643fd3..688f196b6 100644 --- a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala @@ -1,12 +1,11 @@ package org.clulab.sequences import org.clulab.processors.Sentence +import org.clulab.scala.SeqView import org.clulab.scala.WrappedArray._ import org.clulab.struct.{EntityValidator, TrueEntityValidator} -import org.clulab.utils.ArrayView import java.io.File -import scala.collection.mutable /** * The abstract base class for several concrete child classes used for Named Entity @@ -55,7 +54,7 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: * @param sentence The input sentence * @return An array of BIO notations the store the outcome of the matches */ - def find(sentence: Sentence): Array[String] + def find(sentence: Sentence): Seq[String] def getLabels: Seq[String] /** @@ -74,49 +73,49 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: } } - def hasCondition(wordsView: ArrayView[String], condition: Char => Boolean): Boolean = + def hasCondition(wordsView: SeqView.Type[String], condition: Char => Boolean): Boolean = wordsView.exists(_.exists(condition)) - def hasLetter(wordsView: ArrayView[String]): Boolean = + def hasLetter(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isLetter) - def hasDigit(wordsView: ArrayView[String]): Boolean = + def hasDigit(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isDigit) - def hasUpperCaseLetters(wordsView: ArrayView[String]): Boolean = + def hasUpperCaseLetters(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isUpperCase) - def hasSpace(wordsView: ArrayView[String]): Boolean = wordsView.length > 1 + def hasSpace(wordsView: SeqView.Type[String]): Boolean = wordsView.size > 1 - def countCharacters(wordsView: ArrayView[String]): Int = + def countCharacters(wordsView: SeqView.Type[String]): Int = // Go ahead and calculate them all even though we only need to know if they exceed a value. wordsView.foldLeft(0) { (sum, word) => sum + word.length } - val contentQualifiers: Array[ArrayView[String] => Boolean] = Array( + val contentQualifiers: Array[SeqView.Type[String] => Boolean] = Array( // Start with the quick and easy ones. hasSpace, - { wordsView => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, + { (wordsView: SeqView.Type[String]) => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, hasDigit, hasUpperCaseLetters, - { wordsView => knownCaseInsensitives.contains(wordsView.head) } + { (wordsView: SeqView.Type[String]) => knownCaseInsensitives.contains(wordsView.head) } ) protected def contentfulSpan(sentence: Sentence, start: Int, length: Int): Boolean = { - val wordsView = ArrayView(sentence.words, start, start + length) + val wordsView = sentence.words.view.slice(start, start + length) // A valid view/span must have a letter and at least one of the other qualifiers. val contentful = hasLetter(wordsView) && contentQualifiers.exists(_(wordsView)) contentful } - protected val getTokens: Sentence => Array[String] = + protected val getTokens: Sentence => Seq[String] = // Decide this once and for all and don't revisit it each time getTokens is called. 
if (useLemmas) getLemmas else getWords - protected def getLemmas(sentence: Sentence): Array[String] = sentence.lemmas.get + protected def getLemmas(sentence: Sentence): Seq[String] = sentence.lemmas.get - protected def getWords(sentence: Sentence): Array[String] = sentence.words + protected def getWords(sentence: Sentence): Seq[String] = sentence.words } object LexiconNER { @@ -313,7 +312,7 @@ object LexiconNER { var upperCaseLetters = 0 val spaces = math.max(0, end - start - 1) // Spaces are between words, not after them. - ArrayView(words, start, end).foreach { word => + words.view.slice(start, end).foreach { word => characters += word.length word.foreach { c => if (Character.isLetter(c)) letters += 1 @@ -346,7 +345,7 @@ object LexiconNER { while (offset < length) { val notOutsideCount = countWhile(src, offset, isNotOutside) // Check that there is not anything in dst that should not be overwritten. - if (!ArrayView(dst, offset, offset + notOutsideCount).exists(isNotOutside(_))) + if (!dst.view.slice(offset, offset + notOutsideCount).exists(isNotOutside(_))) Array.copy(src, offset, dst, offset, notOutsideCount) offset += notOutsideCount diff --git a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala index aa6ac8b47..ff2dacaab 100644 --- a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala @@ -14,7 +14,7 @@ import scala.reflect.ClassTag import scala.util.Using /** - * Sequence tagger using a maximum entrop Markov model (MEMM) + * Sequence tagger using a maximum entropy Markov model (MEMM) * User: mihais * Date: 8/26/17 */ @@ -32,7 +32,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v var sentCount = 0 for(doc <- docs; origSentence <- doc.sentences) { // labels and features for one sentence - val sentence = if(leftToRight) origSentence else origSentence.revert() + val sentence = if(leftToRight) origSentence else origSentence.reverse() val labels = if(leftToRight) labelExtractor(origSentence) else SeqUtils.revert(labelExtractor(origSentence)).toArray @@ -67,8 +67,8 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v logger.debug("Finished training.") } - override def classesOf(origSentence: Sentence):Array[L] = { - val sentence = if(leftToRight) origSentence else origSentence.revert() + override def classesOf(origSentence: Sentence):Seq[L] = { + val sentence = if(leftToRight) origSentence else origSentence.reverse() val history = new ArrayBuffer[L]() for(i <- 0 until sentence.size) { @@ -80,7 +80,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v history += label } - if(leftToRight) history.toArray else SeqUtils.revert(history).toArray + if(leftToRight) history else SeqUtils.revert(history) } override def save(file: File): Unit = { diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 1c2b2bcb9..a8f2a8da8 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -1,5 +1,9 @@ package org.clulab.sequences +import org.clulab.utils.WrappedArraySeq + +import scala.collection.mutable + // This is definitely not the most efficient as far as number of objects // created, but there should be a NamedEntity thing to hold and not just // shadows of 
it projected onto the BIO notation in an array of strings. @@ -23,7 +27,7 @@ object NamedEntity { val INSIDE = "I-" val OUTSIDE = "O" - def collect(bioLabels: IndexedSeq[String]): IndexedSeq[NamedEntity] = { + def collect(bioLabels: Seq[String]): Seq[NamedEntity] = { def mkNamedEntity(label: String, begin: Int): NamedEntity = { // Start looking for the end one after the begin. @@ -41,48 +45,51 @@ object NamedEntity { namedEntities } - def combine(bioLabels: Array[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Array[String] = { + def combine(bioLabels: Seq[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Seq[String] = { + val bioLabelsArray = bioLabels.toArray // Neither named entities sequence can contain overlapping elements within the sequence. // At most, there is overlap between sequences. Use is made of that fact. // The NamedEntities never have empty Ranges, so end - 1 is always at least start. - val outsides = bioLabels.indices.filter(bioLabels(_) == OUTSIDE) + val outsides = bioLabelsArray.indices.filter(bioLabelsArray(_) == OUTSIDE) val validStarts = (genericNamedEntities.map(_.range.start) ++ outsides).toSet // The -1 is used to coordinate ends (exclusive) with the OUTSIDE positions (inclusive). val validEnds = (genericNamedEntities.map(_.range.end - 1) ++ outsides).toSet + val validCustomNamedEntities = customNamedEntities.filter { customNamedEntity => + validStarts(customNamedEntity.range.start) && validEnds(customNamedEntity.range.end - 1) + } - customNamedEntities.foreach { customNamedEntity => - if (validStarts(customNamedEntity.range.start) && validEnds(customNamedEntity.range.end - 1)) - customNamedEntity.fill(bioLabels) + validCustomNamedEntities.foreach { customNamedEntity => + customNamedEntity.fill(bioLabelsArray) } - bioLabels + WrappedArraySeq(bioLabelsArray).toImmutableSeq } - def isValid(bioLabels: Array[String], index: Int): Boolean = { - val currBioLabel = bioLabels(index) - !currBioLabel.startsWith(INSIDE) || { - 0 < index && { - val prevBioLabel = bioLabels(index - 1) - prevBioLabel == currBioLabel || { - prevBioLabel == toBegin(currBioLabel) - } - } + // Only INSIDEs can be invalid, and they are made valid by + // converting them into a BEGIN. + def toBegin(bioLabel: String): String = BEGIN + bioLabel.drop(INSIDE.length) + + def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall { index => + isValid(bioLabels(index), bioLabels.lift(index - 1)) + } + + def isValid(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { + !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.exists { prevBioLabel => + prevBioLabel == currBioLabel || prevBioLabel == toBegin(currBioLabel) } } - def isValid(bioLabels: Array[String]): Boolean = - bioLabels.indices.forall(isValid(bioLabels, _)) + def patch(bioLabels: Seq[String]): Seq[String] = { + var prevBioLabelOpt = bioLabels.lift(-1) + val newBioLabels = bioLabels.indices.map { index => + val oldBioLabel = bioLabels(index) + val newBioLabel = + if (!isValid(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) + else oldBioLabel - // Only INSIDEs can be invalid and they are made valid by - // converting them into a BEGIN. - def toBegin(bioLabel: String): String = - BEGIN + bioLabel.drop(INSIDE.length) - - // Note that this patches the array in place! 
- def patch(bioLabels: Array[String]): Array[String] = { - bioLabels.indices.foreach { index => - if (!isValid(bioLabels, index)) - bioLabels(index) = toBegin(bioLabels(index)) + prevBioLabelOpt = Some(newBioLabel) + newBioLabel } - bioLabels + + newBioLabels } } diff --git a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala index 435b91b5d..852ba1d69 100644 --- a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala @@ -63,7 +63,7 @@ class SeparatedLexiconNER( * @param sentence The input sentence * @return An array of BIO notations the store the outcome of the matches */ - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val seq = findLongestMatch(sentence) seq } @@ -110,7 +110,7 @@ class SeparatedLexiconNER( labels } - protected def findAt(tokens: Array[String], caseInsensitiveTokens: Array[String], offset: Int): (Int, Int) = { + protected def findAt(tokens: Seq[String], caseInsensitiveTokens: Seq[String], offset: Int): (Int, Int) = { def findAt(matcher: BooleanHashTrie): Int = matcher.findAt(if (matcher.caseInsensitive) caseInsensitiveTokens else tokens, offset).length diff --git a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala index 6c902e89f..76081875a 100644 --- a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala @@ -15,7 +15,7 @@ import scala.util.Using trait SequenceTagger[L, F] extends Tagger[L] { def train(docs:Iterator[Document]): Unit - def classesOf(sentence: Sentence):Array[L] + def classesOf(sentence: Sentence):Seq[L] /** Abstract method that generates the features for the word at the position offset in the given sentence */ def featureExtractor(features:Counter[F], sentence: Sentence, offset:Int): Unit @@ -23,7 +23,7 @@ trait SequenceTagger[L, F] extends Tagger[L] { /** Abstract method that extracts the training labels for a given sentence */ def labelExtractor(sentence:Sentence): Array[L] - override def find(sentence: Sentence): Array[L] = classesOf(sentence) + override def find(sentence: Sentence): Seq[L] = classesOf(sentence) def save(fn:File): Unit diff --git a/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala b/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala index 9bcb7368f..1b4566e68 100644 --- a/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala +++ b/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala @@ -5,6 +5,7 @@ import java.io.File import jline.console.ConsoleReader import jline.console.history.FileHistory import org.clulab.processors.Sentence +import org.clulab.scala.WrappedArray._ /** * Simple shell for sequence taggers diff --git a/library/src/main/scala/org/clulab/sequences/Tagger.scala b/library/src/main/scala/org/clulab/sequences/Tagger.scala index 973e4dba2..8e3f54e5c 100644 --- a/library/src/main/scala/org/clulab/sequences/Tagger.scala +++ b/library/src/main/scala/org/clulab/sequences/Tagger.scala @@ -8,5 +8,5 @@ import org.clulab.processors.Sentence * Date: 10/12/17 */ trait Tagger[L] { - def find(sentence:Sentence):Array[L] + def find(sentence:Sentence):Seq[L] } diff --git a/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala 
b/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala index 523873e65..9508b133f 100644 --- a/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala @@ -8,7 +8,7 @@ object CoNLLUSerializer { val UNDEF = "_" val ROOT = "root" - def getOrElseUndef(stringsOpt: Option[Array[String]], i: Int): String = + def getOrElseUndef(stringsOpt: Option[Seq[String]], i: Int): String = stringsOpt.map(_(i)).getOrElse(UNDEF) /** diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index 8016375ee..093185b7f 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -3,6 +3,7 @@ package org.clulab.serialization import org.clulab.processors.DocumentAttachment import org.clulab.processors.DocumentAttachmentBuilderFromText import org.clulab.processors.{Document, Sentence} +import org.clulab.scala.WrappedArrayBuffer._ import org.clulab.struct._ import org.clulab.utils.Logging import org.json4s.DefaultFormats @@ -102,13 +103,9 @@ class DocumentSerializer extends Logging { assert(bits(0) == END_OF_DOCUMENT, s"END_OF_DOCUMENT expected, found ${bits(0)}") - val doc = Document(sents.toArray) - doc.coreferenceChains = coref - doc.text = text - - // TODO: Hack by Enrique to resolve the document object for the relations - for(sen <- doc.sentences){ + /* + val relationsOpt = for(sen <- sents){ sen.relations match { case Some(relations) => val newRelations = relations.map(r => RelationTriple(r.confidence, r.subjectInterval, r.relationInterval, r.objectInterval)) @@ -116,13 +113,18 @@ class DocumentSerializer extends Logging { case None => () } } + */ - namedDocumentAttachmentsOpt.foreach { namedDocumentAttachments => - namedDocumentAttachments.foreach { case (name: String, documentAttachment: DocumentAttachment) => - doc.addAttachment(name, documentAttachment) - } + val attachmentsOpt = namedDocumentAttachmentsOpt.map { namedDocumentAttachments => + namedDocumentAttachments.toMap } + val doc = new Document( + sentences = sents, + text = text, + attachments = attachmentsOpt + ) + doc } @@ -166,7 +168,7 @@ class DocumentSerializer extends Logging { Interval(t(0), t(1)) } - private def loadRelations(r: BufferedReader, sz: Int):Option[Array[RelationTriple]] = { + private def loadRelations(r: BufferedReader, sz: Int):Option[Seq[RelationTriple]] = { val ret = (0 until sz) map { _ => val line = r.readLine() @@ -174,7 +176,7 @@ class DocumentSerializer extends Logging { val relInterval = tokens(2) match { case "N" => None; case s => Some(mkRelationInterval(s)) } RelationTriple(tokens(0).toFloat, mkRelationInterval(tokens(1)), relInterval, mkRelationInterval(tokens(3))) } - Some(ret.toArray) + Some(ret) } private def loadSentence(r:BufferedReader): Sentence = { @@ -230,9 +232,9 @@ class DocumentSerializer extends Logging { assert(normBuffer.isEmpty || normBuffer.size == tokenCount) assert(chunkBuffer.isEmpty || chunkBuffer.size == tokenCount) - var deps = GraphMap() + var deps = GraphMap.empty var tree:Option[Tree] = None - var relations:Option[Array[RelationTriple]] = None + var relations:Option[Seq[RelationTriple]] = None while ({ bits = read(r) if (bits(0) == START_DEPENDENCIES) { @@ -252,10 +254,10 @@ class DocumentSerializer extends Logging { }) () Sentence( - rawBuffer.toArray, - 
startOffsetBuffer.toArray, - endOffsetBuffer.toArray, - wordBuffer.toArray, + rawBuffer, + startOffsetBuffer, + endOffsetBuffer, + wordBuffer, bufferOption(tagBuffer, nilTags), bufferOption(lemmaBuffer, nilLemmas), bufferOption(entityBuffer, nilEntities), @@ -288,10 +290,10 @@ class DocumentSerializer extends Logging { dg } - private def bufferOption[T: ClassTag](b:ArrayBuffer[T], allNils:Boolean): Option[Array[T]] = { + private def bufferOption[T: ClassTag](b:ArrayBuffer[T], allNils:Boolean): Option[Seq[T]] = { if (b.isEmpty) None else if (allNils) None - else Some(b.toArray) + else Some(b) } def save(doc:Document, os:PrintWriter): Unit = save(doc, os, keepText = false) @@ -329,11 +331,12 @@ class DocumentSerializer extends Logging { } // Sort these so that serialization is the same each time. - val attachmentKeys = doc.getAttachmentKeys.toList.sorted + val attachments = doc.attachments.getOrElse(Map.empty) + val attachmentKeys = attachments.keySet if (attachmentKeys.nonEmpty) { os.println(START_ATTACHMENTS + SEP + attachmentKeys.size) attachmentKeys.foreach { key => - val value = doc.getAttachment(key).get + val value = attachments(key) os.print(escapeAttachment(key)) os.print(SEP) os.print(escapeAttachment(value.documentAttachmentBuilderFromTextClassName)) diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala index 181400b2d..ebc20d8b7 100644 --- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala @@ -1,10 +1,9 @@ package org.clulab.serialization.json import java.io.File -import org.clulab.processors.DocumentAttachmentBuilderFromJson -import org.clulab.processors.{Document, Sentence} +import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, DocumentAttachments, Sentence} import org.clulab.struct.Edge -import org.clulab.struct.{DirectedGraph, GraphMap} +import org.clulab.struct.DirectedGraph import org.clulab.utils.FileUtils import org.json4s import org.json4s.JsonDSL._ @@ -12,6 +11,8 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.prettyJson +import scala.collection.mutable + /** JSON serialization utilities */ // This annotation is to avoid "Compiler synthesis of Manifest and OptManifest is deprecated". @@ -23,12 +24,12 @@ object JSONSerializer { def jsonAST(f: File): JValue = jsonAST(FileUtils.getTextFromFile(f)) - protected def addDocumentAttachments(doc: Document, jValue: JValue): Unit = { + protected def getDocumentAttachments(jValue: JValue): Option[DocumentAttachments.Type] = { // See also DocumentSerializer for text version of nearly the same thing. 
(jValue \ DOCUMENT_ATTACHMENTS_KEY) match { case jObject: JObject => val keys = jObject.values.keys - keys.foreach { (key: String) => + val keyAndDocumentAttachmentPairs = keys.flatMap { (key: String) => (jObject \ key) match { case jObject: JObject => val documentAttachmentBuilderFromJsonClassName = (jObject \ DOCUMENT_ATTACHMENTS_BUILDER_KEY).extract[String] @@ -38,28 +39,36 @@ object JSONSerializer { val documentAttachmentBuilder = obj.asInstanceOf[DocumentAttachmentBuilderFromJson] val value = (jObject \ DOCUMENT_ATTACHMENTS_VALUE_KEY) val documentAttachment = documentAttachmentBuilder.mkDocumentAttachment(value) - doc.addAttachment(key, documentAttachment) + + Some((key, documentAttachment)) case jValue: JValue => val text = prettyJson(jValue) throw new RuntimeException(s"ERROR: While deserializing document attachments expected JObject but found this: $text") // case _ => // noop. It should never get here. (Famous last words.) - case null => // noop. It should never get here. (Famous last words.) Scala 3 prefers null over _. + case null => None // noop. It should never get here. (Famous last words.) Scala 3 prefers null over _. } } + Some(keyAndDocumentAttachmentPairs.toMap) case _ => // Leave documentAttachments as is: None + None } } def toDocument(json: JValue): Document = { // recover sentences val sentences = (json \ "sentences").asInstanceOf[JArray].arr.map(sjson => toSentence(sjson)).toArray + val id = getStringOption(json, "id") + val text = getStringOption(json, "text") // initialize document - val d = Document(sentences) - // update id - d.id = getStringOption(json, "id") - // update text - d.text = getStringOption(json, "text") - addDocumentAttachments(d, json) + val attachments = getDocumentAttachments(json) + val d = new Document( + id = id, + sentences = sentences, + coreferenceChains = None, + text = text, + attachments = attachments + ) + d } def toDocument(docHash: String, djson: JValue): Document = toDocument(djson \ docHash) @@ -68,25 +77,40 @@ object JSONSerializer { def toSentence(json: JValue): Sentence = { - def getLabels(json: JValue, k: String): Option[Array[String]] = json \ k match { + def getStrings(json: JValue, k: String): Array[String] = (json \ k).extract[Array[String]] + + def getInts(json: JValue, k: String): Array[Int] = (json \ k).extract[Array[Int]] + + def getLabelsOpt(json: JValue, k: String): Option[Seq[String]] = json \ k match { case JNothing => None case contents => Some(contents.extract[Array[String]]) } - val s = json.extract[Sentence] - val preferredSize = s.words.length - // build dependencies - val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) => - key -> toDirectedGraph(json, Some(preferredSize)) - }.toMap - s.graphs = GraphMap(graphs) - // build labels - s.tags = getLabels(json, "tags") - s.lemmas = getLabels(json, "lemmas") - s.entities = getLabels(json, "entities") - s.norms = getLabels(json, "norms") - s.chunks = getLabels(json, "chunks") - s + val raw = getStrings(json, "raw") + val startOffsets = getInts(json, "startOffsets") + val endOffsets = getInts(json, "endOffsets") + val words = getStrings(json, "words") + val tags = getLabelsOpt(json, "tags") + val lemmas = getLabelsOpt(json, "lemmas") + val entities = getLabelsOpt(json, "entities") + val norms = getLabelsOpt(json, "norms") + val chunks = getLabelsOpt(json, "chunks") + val syntacticTree = None // TODO: Are these not serialized? 
+ val graphs = { + val preferredSize = words.length + val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) => + key -> toDirectedGraph(json, Some(preferredSize)) + }.toMap + + graphs + } + val relations = None // TODO: Are these not serialized? + val parsedSentence = Sentence( + raw, startOffsets, endOffsets, words, + tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations + ) + + parsedSentence } def toDirectedGraph(json: JValue, preferredSizeOpt: Option[Int] = None): DirectedGraph[String] = { diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala index 27adb3fd9..a0fbf4f0e 100644 --- a/library/src/main/scala/org/clulab/serialization/json/package.scala +++ b/library/src/main/scala/org/clulab/serialization/json/package.scala @@ -22,7 +22,7 @@ package object json { } // Arrays cannot be directly converted to JValue - implicit class ArrayOps(s: Option[Array[String]]) { + implicit class ArrayOps(s: Option[Seq[String]]) { def toSerializableJSON: Option[List[String]] = s match { case Some(s) => Some(s.toList) case None => None @@ -52,8 +52,8 @@ package object json { } } - implicit class GraphMapOps(gm: GraphMap) extends JSONSerialization { - def jsonAST: JValue = Extraction.decompose(gm.toMap.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues + implicit class GraphMapOps(gm: GraphMap.Type) extends JSONSerialization { + def jsonAST: JValue = Extraction.decompose(gm.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues } /** For Document */ @@ -61,10 +61,11 @@ package object json { def jsonAST: JValue = { // See also DocumentSerializer for a similar text implementation. - val attachmentKeys = doc.getAttachmentKeys.toList.sorted + val attachments = doc.attachments.getOrElse(Map.empty) + val attachmentKeys = attachments.keySet.toList.sorted val documentAttachments: JValue = if (attachmentKeys.nonEmpty) { val jFields = attachmentKeys.map { key => - val value = doc.getAttachment(key).get + val value = attachments(key) JField(key, (DOCUMENT_ATTACHMENTS_BUILDER_KEY -> JString(value.documentAttachmentBuilderFromJsonClassName)) ~ (DOCUMENT_ATTACHMENTS_VALUE_KEY -> value.toJsonSerializer) diff --git a/library/src/main/scala/org/clulab/struct/Annotation.scala b/library/src/main/scala/org/clulab/struct/Annotation.scala new file mode 100644 index 000000000..d9f390a86 --- /dev/null +++ b/library/src/main/scala/org/clulab/struct/Annotation.scala @@ -0,0 +1,37 @@ +package org.clulab.struct + +// These are by the word ones and then there are relationships between words. +// So parse, might not be a thing that is per word. 
+//case class WordParse(tag: String, lemma: String, entity: String, norm: String, chunk: String) + +//case class SentenceParse(tags: Array[String], syntacticTree, graphs, relations) + +case class Annotation( + tags: Option[Array[String]] = None, + /** Lemmas */ + lemmas: Option[Array[String]] = None, + /** NE labels */ + entities: Option[Array[String]] = None, + /** Normalized values of named/numeric entities, such as dates */ + norms: Option[Array[String]] = None, + /** Shallow parsing labels */ + chunks: Option[Array[String]] = None, + /** Constituent tree of this sentence; includes head words */ + syntacticTree: Option[Tree] = None, + /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ + graphs: GraphMap.Type = GraphMap.empty, + /** Relation triples from OpenIE */ + relations:Option[Array[RelationTriple]] = None +) { + + def reverse: Annotation = { + Annotation( + tags = tags.map(_.reverse), + lemmas = lemmas.map(_.reverse), + entities = entities.map(_.reverse), + norms = norms.map(_.reverse), + chunks = chunks.map(_.reverse) + // TODO: reverse syntacticTree, graphs, and relations! + ) + } +} diff --git a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala index fd0b586fa..5ab19e3ad 100644 --- a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala @@ -80,7 +80,7 @@ class BooleanHashTrie(val label: String, val caseInsensitive: Boolean = true) ex * When multiple paths are found, the longest one is kept * Text must be normalized (i.e., case folding) BEFORE this call, if necessary! */ - def findAt(sequenceNormalized: Array[String], offset: Int): BooleanTrieNode.Match = { + def findAt(sequenceNormalized: Seq[String], offset: Int): BooleanTrieNode.Match = { val longestMatch = new BooleanTrieNode.Match() entries.get(sequenceNormalized(offset)).map { tree => @@ -129,7 +129,7 @@ case class BooleanTrieNode(token: String, var completePath: Boolean, var childre * @param longestMatch The value of the longest match interval * @return true if search should stop here; false otherwise */ - def find(sequence: Array[String], + def find(sequence: Seq[String], startOffset: Int, currentSpanLength: Int, longestMatch: BooleanTrieNode.Match): Boolean = { @@ -261,13 +261,13 @@ class DebugBooleanHashTrie(label: String, caseInsensitive: Boolean = true) exten * Generates BIO labels for this sequence when complete trie paths match * When multiple paths match, the longest one is kept */ - def find(sequence: Array[String], outsideLabel: String): Array[String] = { + def find(sequence: Seq[String], outsideLabel: String): Array[String] = { val casedSequence = if (caseInsensitive) sequence.map(_.toLowerCase) else sequence findNormalized(casedSequence, outsideLabel) } - private def findNormalized(sequence: Array[String], outsideLabel: String): Array[String] = { + private def findNormalized(sequence: Seq[String], outsideLabel: String): Array[String] = { val labels = new Array[String](sequence.length) var offset = 0 diff --git a/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala b/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala deleted file mode 100644 index 82a8b39ab..000000000 --- a/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala +++ /dev/null @@ -1,7 +0,0 @@ -package org.clulab.struct - -trait DependencyMapNames { - val STANFORD_BASIC = 0 // basic Stanford dependencies - val STANFORD_COLLAPSED = 1 
// collapsed Stanford dependencies - val SEMANTIC_ROLES = 2 // semantic roles from CoNLL 2008-09, which includes PropBank and NomBank -} diff --git a/library/src/main/scala/org/clulab/struct/GraphMapNames.scala b/library/src/main/scala/org/clulab/struct/GraphMap.scala similarity index 86% rename from library/src/main/scala/org/clulab/struct/GraphMapNames.scala rename to library/src/main/scala/org/clulab/struct/GraphMap.scala index 012f0f52a..6857916e3 100644 --- a/library/src/main/scala/org/clulab/struct/GraphMapNames.scala +++ b/library/src/main/scala/org/clulab/struct/GraphMap.scala @@ -1,6 +1,10 @@ package org.clulab.struct -trait GraphMapNames { +object GraphMap { + type Type = Map[String, DirectedGraph[String]] + + val empty: Type = Map.empty + val UNIVERSAL_BASIC = "universal-basic" // basic Universal dependencies val UNIVERSAL_ENHANCED = "universal-enhanced" // collapsed (or enhanced) Universal dependencies val STANFORD_BASIC = "stanford-basic" // basic Stanford dependencies diff --git a/library/src/main/scala/org/clulab/struct/HashTrie.scala b/library/src/main/scala/org/clulab/struct/HashTrie.scala index 331858735..1bcd8c0af 100644 --- a/library/src/main/scala/org/clulab/struct/HashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/HashTrie.scala @@ -4,11 +4,11 @@ package org.clulab.struct class HashTrie(caseInsensitive: Boolean = true) extends BooleanHashTrie("", caseInsensitive) { - def find(sequence:Array[String], label: String, outsideLabel: String): Array[String] = + def find(sequence:Seq[String], label: String, outsideLabel: String): Array[String] = if (caseInsensitive) findNormalized(sequence.map(_.toLowerCase), label, outsideLabel) else findNormalized(sequence, label, outsideLabel) - protected def findNormalized(tokens: Array[String], label: String, outsideLabel: String): Array[String] = { + protected def findNormalized(tokens: Seq[String], label: String, outsideLabel: String): Array[String] = { val labels = new Array[String](tokens.length) lazy val bLabel = "B-" + label // lazy thinking that most calls will not use it lazy val iLabel = "I-" + label diff --git a/library/src/main/scala/org/clulab/struct/IntHashTrie.scala b/library/src/main/scala/org/clulab/struct/IntHashTrie.scala index 70a22984e..9b3403cc5 100644 --- a/library/src/main/scala/org/clulab/struct/IntHashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/IntHashTrie.scala @@ -82,7 +82,7 @@ class IntHashTrie(val caseInsensitive: Boolean = true) extends Serializable { * When multiple paths are found, the longest one is kept * Text must be normalized (i.e., case folding) BEFORE this call, if necessary! 
*/ - def findAt(sequenceNormalized: Array[String], offset: Int): IntTrieNode.Match = { + def findAt(sequenceNormalized: Seq[String], offset: Int): IntTrieNode.Match = { val longestMatch = new IntTrieNode.Match() entries.get(sequenceNormalized(offset)).map { tree => @@ -134,7 +134,7 @@ case class IntTrieNode(token:String, var completePath: Int, var children: Option * @param longestMatch The value of the longest match interval * @return true if search should stop here; false otherwise */ - def find(sequence: Array[String], + def find(sequence: Seq[String], startOffset: Int, currentSpanLength: Int, longestMatch: IntTrieNode.Match): Boolean = { diff --git a/library/src/main/scala/org/clulab/struct/Tokenization.scala b/library/src/main/scala/org/clulab/struct/Tokenization.scala new file mode 100644 index 000000000..78e8b21da --- /dev/null +++ b/library/src/main/scala/org/clulab/struct/Tokenization.scala @@ -0,0 +1,21 @@ +package org.clulab.struct + +// An alternative design would not use aligned arrays, but an array of structures. +case class WordTokenization(raw: String, startOffset: Int, endOffset: Int, word: String) + +case class Tokenization( + raw: Array[String], + startOffsets: Array[Int], + endOffsets: Array[Int], + words: Array[String] +) { + + def reverse: Tokenization = { + Tokenization( + raw = raw.reverse, + startOffsets = startOffsets.reverse, + endOffsets = endOffsets.reverse, + words = words.reverse + ) + } +} diff --git a/library/src/main/scala/org/clulab/utils/ArrayView.scala b/library/src/main/scala/org/clulab/utils/ArrayView.scala deleted file mode 100644 index afbd6d42a..000000000 --- a/library/src/main/scala/org/clulab/utils/ArrayView.scala +++ /dev/null @@ -1,37 +0,0 @@ -package org.clulab.utils - -import scala.collection.mutable - -// Array.view(from, until) is no longer available in Scala 2.13+. -class ArrayView[T](array: Array[T], from: Int, until: Int) extends IndexedSeq[T] { - val length = until - from - - override def apply(index: Int): T = array(from + index) -} - -object ArrayView { - - def apply[T](array: Array[T]): ArrayView[T] = apply(array, 0) - - def apply[T](array: Array[T], from: Int): ArrayView[T] = apply(array, from, array.length) - - def apply[T](array: Array[T], from: Int, until: Int): ArrayView[T] = new ArrayView(array, from, until) -} - -// Array.view(from, until) is no longer available in Scala 2.13+. 
-class MutableArrayView[T](array: Array[T], from: Int, until: Int) extends mutable.IndexedSeq[T] { - val length = until - from - - override def apply(index: Int): T = array(from + index) - - override def update(index: Int, elem: T): Unit = array(from + index) = elem -} - -object MutableArrayView { - - def apply[T](array: Array[T]): MutableArrayView[T] = apply(array, 0) - - def apply[T](array: Array[T], from: Int): MutableArrayView[T] = apply(array, from, array.length) - - def apply[T](array: Array[T], from: Int, until: Int): MutableArrayView[T] = new MutableArrayView(array, from, until) -} diff --git a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala index 3c19d2c1d..1eb8314d5 100644 --- a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala +++ b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala @@ -23,29 +23,29 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer} object ToEnhancedDependencies { type EdgeSpec = (Int, Int, String) - def generateStanfordEnhancedDependencies(sentence:Sentence, dg:DirectedGraph[String]): DirectedGraph[String] = { + def generateStanfordEnhancedDependencies(words: Array[String], tags: Seq[String], dg:DirectedGraph[String]): DirectedGraph[String] = { val dgi = dg.toDirectedGraphIndex() - collapsePrepositionsStanford(sentence, dgi) + collapsePrepositionsStanford(words, dgi) raiseSubjects(dgi) - pushSubjectsObjectsInsideRelativeClauses(sentence, dgi, universal = false) - propagateSubjectsAndObjectsInConjVerbs(sentence, dgi, universal = false) - propagateConjSubjectsAndObjects(sentence, dgi) - dgi.toDirectedGraph(Some(sentence.size)) + pushSubjectsObjectsInsideRelativeClauses(tags, dgi, universal = false) + propagateSubjectsAndObjectsInConjVerbs(tags, dgi, universal = false) + propagateConjSubjectsAndObjects(tags, dgi) + dgi.toDirectedGraph(Some(words.length)) } - def generateUniversalEnhancedDependencies(sentence:Sentence, dg:DirectedGraph[String]): DirectedGraph[String] = { + def generateUniversalEnhancedDependencies(words: Seq[String], lemmas: Seq[String], tags: Seq[String], dg: DirectedGraph[String]): DirectedGraph[String] = { val dgi = dg.toDirectedGraphIndex() - collapseMWEs(sentence, dgi) - val collapsedNmods = collapsePrepositionsUniversal(sentence, dgi) + collapseMWEs(lemmas, tags, dgi) + val collapsedNmods = collapsePrepositionsUniversal(words, lemmas, tags, dgi) replicateCollapsedNmods(collapsedNmods, dgi) raiseSubjects(dgi) - pushSubjectsObjectsInsideRelativeClauses(sentence, dgi, universal = true) - propagateSubjectsAndObjectsInConjVerbs(sentence, dgi, universal = true) - propagateConjSubjectsAndObjects(sentence, dgi) + pushSubjectsObjectsInsideRelativeClauses(tags, dgi, universal = true) + propagateSubjectsAndObjectsInConjVerbs(tags, dgi, universal = true) // requires tags + propagateConjSubjectsAndObjects(tags, dgi) mergeNsubjXcomp(dgi) - replicateCopulativeSubjects(sentence, dgi) - expandConj(sentence, dgi) // this must be last because several of the above methods expect "conj" labels - dgi.toDirectedGraph(Some(sentence.size)) + replicateCopulativeSubjects(dgi) + expandConj(words, dgi) // this must be last because several of the above methods expect "conj" labels + dgi.toDirectedGraph(Some(words.length)) } /** @@ -66,7 +66,7 @@ object ToEnhancedDependencies { * Replicates copulative subjects across conjunctions * It is difficult and expensive => nsubj from 2 to 0 and from 4 to 0 */ - def 
replicateCopulativeSubjects(sentence: Sentence, dgi: DirectedGraphIndex[String]): Unit = { + def replicateCopulativeSubjects(dgi: DirectedGraphIndex[String]): Unit = { val nsubjs = dgi.findByName("nsubj") for(nsubj <- nsubjs) { val cops = dgi.findByHeadAndName(nsubj.source, "cop") @@ -102,13 +102,13 @@ object ToEnhancedDependencies { * @param sentence * @param dgi */ - def expandConj(sentence: Sentence, dgi: DirectedGraphIndex[String]): Unit = { + def expandConj(words: Seq[String], dgi: DirectedGraphIndex[String]): Unit = { val toRemove = new ListBuffer[Edge[String]] val conjs = dgi.findByName("conj") for (conj <- conjs) { var shouldRemove = false for(cc <- dgi.findByName("cc").filter(_.source == conj.source)) { - val ccWord = sentence.words(cc.destination).toLowerCase() + val ccWord = words(cc.destination).toLowerCase() dgi.addEdge(conj.source, conj.destination, s"conj_$ccWord") shouldRemove = true } @@ -125,12 +125,12 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def collapsePrepositionsStanford(sentence:Sentence, dgi:DirectedGraphIndex[String]): Unit = { + def collapsePrepositionsStanford(words: Array[String], dgi:DirectedGraphIndex[String]): Unit = { val toRemove = new ListBuffer[Edge[String]] val preps = dgi.findByName("prep") for(prep <- preps) { toRemove += prep - val word = sentence.words(prep.destination) + val word = words(prep.destination) for(pobj <- dgi.findByName("pobj").filter(_.source == prep.destination)) { dgi.addEdge(prep.source, pobj.destination, s"prep_$word") toRemove += pobj @@ -140,12 +140,12 @@ object ToEnhancedDependencies { } def collapsePrepositionsUniversal( - sentence:Sentence, + words: Seq[String], lemmas: Seq[String], tags: Seq[String], dgi:DirectedGraphIndex[String]): Seq[EdgeSpec] = { val collapsedNmods = new ArrayBuffer[EdgeSpec]() - collapsePrepositionsUniversalNmodCase(sentence, dgi, collapsedNmods) - collapsePrepositionsUniversalDueTo(sentence, dgi, collapsedNmods) + collapsePrepositionsUniversalNmodCase(words, dgi, collapsedNmods) + collapsePrepositionsUniversalDueTo(lemmas, tags, dgi, collapsedNmods) collapsedNmods } @@ -156,7 +156,7 @@ object ToEnhancedDependencies { * @param dgi The directed graph of collapsed dependencies at this stage */ def collapsePrepositionsUniversalNmodCase( - sentence:Sentence, + words: Seq[String], dgi:DirectedGraphIndex[String], collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { @@ -166,9 +166,9 @@ object ToEnhancedDependencies { for(prep <- preps) { toRemove += prep for(c <- dgi.findByName("case").filter(_.source == prep.destination)) { - val word = sentence.words(c.destination).toLowerCase() + val word = words(c.destination).toLowerCase() // find multi-word prepositions such as "such as" - val mwe = findMultiWord(word, c.destination, sentence, dgi) + val mwe = findMultiWord(word, c.destination, words, dgi) // TODO: add nmod:agent (if word == "by") and passive voice here? 
dgi.addEdge(prep.source, prep.destination, s"nmod_$mwe") @@ -189,16 +189,15 @@ object ToEnhancedDependencies { * @param dgi The directed graph of collapsed dependencies at this stage */ def collapsePrepositionsUniversalDueTo( - sentence:Sentence, + lemmas: Seq[String], tags: Seq[String], dgi:DirectedGraphIndex[String], collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { - val tags = sentence.tags.get val toRemove = new ListBuffer[Edge[String]] var shouldRemove = false val preps = dgi.findByName("mwe") for(prep <- preps) { - if(sentence.lemmas.get(prep.source) == "due" && sentence.lemmas.get(prep.destination) == "to") { + if(lemmas(prep.source) == "due" && lemmas(prep.destination) == "to") { // found a "due to" MWE for(leftDep <- dgi.findByModifier(prep.source)) { // found the dep from "famine" to "due" @@ -235,15 +234,15 @@ object ToEnhancedDependencies { * @param dgi */ def collapseMWEs( - sentence:Sentence, + lemmas: Seq[String], + tags: Seq[String], dgi:DirectedGraphIndex[String]): Unit = { - val lemmas = sentence.lemmas.get - val tags = sentence.tags.get + val size = lemmas.length val toRemove = new ListBuffer[Edge[String]] var shouldRemove = true - for(i <- 0 until sentence.size - 1) { + for(i <- 0 until size - 1) { if(lemmas(i) == "due" && lemmas(i + 1) == "to" && tags(i) == "IN") { val toHeads = dgi.findByModifier(i + 1) var found = false @@ -262,7 +261,7 @@ object ToEnhancedDependencies { if(shouldRemove) remove(toRemove, dgi) } - def findMultiWord(first: String, firstPos: Int, sentence: Sentence, dgi:DirectedGraphIndex[String]): String = { + def findMultiWord(first: String, firstPos: Int, words: Seq[String], dgi:DirectedGraphIndex[String]): String = { val buffer = new StringBuilder buffer.append(first) @@ -273,7 +272,7 @@ object ToEnhancedDependencies { if(mods.isEmpty) { done = true } else { - val word = sentence.words(mods.head.destination).toLowerCase() + val word = words(mods.head.destination).toLowerCase() buffer.append("_") buffer.append(word) head = mods.head.destination @@ -303,9 +302,8 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def propagateSubjectsAndObjectsInConjVerbs(sentence:Sentence, dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { + def propagateSubjectsAndObjectsInConjVerbs(tags: Seq[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { val conjs = dgi.findByName("conj").sortBy(_.source) - val tags = sentence.tags.get for(conj <- conjs) { val left = math.min(conj.source, conj.destination) val right = math.max(conj.source, conj.destination) @@ -387,9 +385,8 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def propagateConjSubjectsAndObjects(sentence:Sentence, dgi:DirectedGraphIndex[String]): Unit = { + def propagateConjSubjectsAndObjects(tags: Seq[String], dgi:DirectedGraphIndex[String]): Unit = { val conjs = dgi.findByName("conj").sortBy(_.source) - val tags = sentence.tags.get for(conj <- conjs) { val left = math.min(conj.source, conj.destination) val right = math.max(conj.source, conj.destination) @@ -424,11 +421,10 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def pushSubjectsObjectsInsideRelativeClauses(sentence:Sentence, dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { + def 
pushSubjectsObjectsInsideRelativeClauses(tags: Seq[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { val rels = if(universal) dgi.findByName("acl:relcl") else dgi.findByName("rcmod") - val tags = sentence.tags.get for(rel <- rels) { val head = rel.source diff --git a/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala b/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala new file mode 100644 index 000000000..a9f13f830 --- /dev/null +++ b/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala @@ -0,0 +1,21 @@ +package org.clulab.utils + +import scala.collection.mutable +import scala.collection.compat.immutable.ArraySeq + +class WrappedArraySeq[T](array: Array[T]) { + def toSeq: Seq[T] = toImmutableSeq + + def toMutableSeq: mutable.Seq[T] = { + array + } + + def toImmutableSeq: Seq[T] = { + ArraySeq.unsafeWrapArray(array) + } +} + +object WrappedArraySeq { + + def apply[T](array: Array[T]): WrappedArraySeq[T] = new WrappedArraySeq(array) +} diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 diff --git 
a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 similarity index 100% rename from 
library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 diff --git 
a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv diff --git a/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala b/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala index 54ce33916..85125c04b 100644 --- a/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala @@ -16,7 +16,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") diff --git a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala index 857f10727..88e2b0726 100644 --- a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala @@ -16,7 +16,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") @@ -34,7 +34,8 @@ class TestHash extends Test { behavior of "Hash" it should "compute the expected equivalence hash for a Document" in { - val expectedHash = 1145238653 + val expectedHash = -1029127286 +// val expectedHash = 1145238653 val actualHash = document.equivalenceHash actualHash should be 
(expectedHash) @@ -56,7 +57,8 @@ class TestHash extends Test { } it should "compute the expected equivalence hashes for Mentions" in { - val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) + val expectedHashes = Array(-674187334, 1183699787, 391766831, -495035159, -2089326276) +// val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) val actualHashes = allMentions.map(getEquivalenceHash) actualHashes should be (expectedHashes) diff --git a/library/src/test/scala-3/org/clulab/utils/TestHash.scala b/library/src/test/scala-3/org/clulab/utils/TestHash.scala index c1dcf17a8..9186e9ae6 100644 --- a/library/src/test/scala-3/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-3/org/clulab/utils/TestHash.scala @@ -17,7 +17,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") @@ -35,7 +35,8 @@ class TestHash extends Test { behavior of "Hash" it should "compute the expected equivalence hash for a Document" in { - val expectedHash = 1145238653 + val expectedHash = -1029127286 +// val expectedHash = 1145238653 val actualHash = document.equivalenceHash actualHash should be (expectedHash) @@ -57,7 +58,8 @@ class TestHash extends Test { } it should "compute the expected equivalence hashes for Mentions" in { - val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) + val expectedHashes = Array(-674187334, 1183699787, 391766831, -495035159, -2089326276) +// val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) val actualHashes = allMentions.map(getEquivalenceHash) actualHashes should be (expectedHashes) diff --git a/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala b/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala index bc22534cd..b5575b1b3 100644 --- a/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala +++ b/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala @@ -8,12 +8,13 @@ class TestEvalTimeNorm extends Test { behavior of "temporal parser" it should "not degrade in performance" in { + val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet" + val testFile = "WorldModelersDatesRangesTimex.csv" + val seasonPath = "/org/clulab/numeric/custom/SEASON.tsv" val expectedFscore = 0.85 - val proc = new BalaurProcessor(seasonPathOpt = Some("/org/clulab/numeric/custom/SEASON.tsv")) - val ner = NumericEntityRecognizer(seasonPath = "/org/clulab/numeric/custom/SEASON.tsv") - val actualFscore = EvalTimeNorm.test(proc, ner) + val proc = new BalaurProcessor(seasonPathOpt = Some(seasonPath)) + val actualFscore = EvalTimeNorm.run(proc, timeNormEvalDir, testFile) + actualFscore should be >= expectedFscore } - } - diff --git a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index 2a9214736..e476f1d43 100644 --- a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -15,7 +15,7 @@ class TestNumericEntityRecognition extends Test { class HabitusTokenizer(tokenizer: Tokenizer) extends Tokenizer(tokenizer.lexer, tokenizer.steps, 
tokenizer.sentenceSplitter) { // TODO: Make sure en dash is preserved in raw somehow! - override def tokenize(text: String, sentenceSplit: Boolean = true): Array[Sentence] = { + override def tokenize(text: String, sentenceSplit: Boolean = true, characterOffset: Int): Seq[Sentence] = { // Cheat and swap out some en dashes if necessary. val habitusText = if (text.contains(HabitusTokenizer.endash)) @@ -23,7 +23,7 @@ class TestNumericEntityRecognition extends Test { else text - tokenizer.tokenize(habitusText, sentenceSplit) + tokenizer.tokenize(habitusText, sentenceSplit, characterOffset) } } @@ -653,10 +653,10 @@ class TestNumericEntityRecognition extends Test { } /** Runs the actual numeric entity recognizer */ - def numericParse(sentence: String): (Array[String], Array[String], Array[String]) = { + def numericParse(sentence: String): (Seq[String], Seq[String], Seq[String]) = { val doc = proc.annotate(sentence) val mentions = ner.extractFrom(doc) - setLabelsAndNorms(doc, mentions) + NumericUtils.mkLabelsAndNorms(doc, mentions) // assume 1 sentence per doc val sent = doc.sentences.head diff --git a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala index 2bad85f6e..423bd3eb7 100644 --- a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala +++ b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala @@ -13,12 +13,12 @@ class TestSeasonNormalizer extends Test { val fallDateRange = "2017-09-22 -- 2017-12-21" val seasonDateRange = "2017-06-XX -- 2017-10-XX" - def mkEntitiesAndNorms(processor: BalaurProcessor, text: String): (Array[String], Array[String]) = { + def mkEntitiesAndNorms(processor: BalaurProcessor, text: String): (Seq[String], Seq[String]) = { val document = processor.annotate(text) - val mentions = processor.extractNumericEntityMentions(document) + val mentions = processor.numericEntityRecognizerOpt.get.extractFrom(document) - setLabelsAndNorms(document, mentions) - (document.sentences.head.entities.get, document.sentences.head.norms.get) + val (entities, norms) = NumericUtils.mkLabelsAndNorms(document, mentions) + (entities.head, norms.head) } behavior of "Default seasonal BalaurProcessor" diff --git a/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala b/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala index 297346984..b3b477e18 100644 --- a/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala +++ b/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala @@ -9,12 +9,12 @@ class TestNumericPatterns extends Test { val text = "blah" val doc = Document( - Array( + Seq( Sentence( - Array("blah"), - Array(0), - Array(4), - Array("blah") + Seq("blah"), + Seq(0), + Seq(4), + Seq("blah") ) ) ) diff --git a/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala b/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala index 10738826d..3300b791e 100644 --- a/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala +++ b/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala @@ -61,13 +61,13 @@ class TestTokenPattern extends Test { } val text4 = "a b c d e f g h i c" - val tokens = text4.split(" ") + val tokens = text4.split(" ").toSeq val doc = Document( - Array( + Seq( Sentence( tokens, - Array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), - Array(1, 3, 5, 7, 9, 11, 13, 15, 17, 19), + Seq(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), + Seq(1, 3, 5, 7, 9, 11, 13, 15, 17, 19), tokens ) ) @@ -614,11 +614,11 @@ class 
TestTokenPattern extends Test { val text8 = "x a a b a b a b a b c d" val doc8 = Document( - Array( + Seq( Sentence( - text8.split(" "), - Array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), - Array(1, 3, 5, 7, 9, 11, 13, 15, 17, 19) + text8.split(" ").toSeq, + Seq(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), + Seq(1, 3, 5, 7, 9, 11, 13, 15, 17, 19) ) ) ) diff --git a/library/src/test/scala/org/clulab/processors/CluTest.scala b/library/src/test/scala/org/clulab/processors/CluTest.scala index 7b7d323e5..025e71413 100644 --- a/library/src/test/scala/org/clulab/processors/CluTest.scala +++ b/library/src/test/scala/org/clulab/processors/CluTest.scala @@ -29,7 +29,7 @@ class CluTest extends Test with BeforeAndAfterAll { ) val lexiconNer = LexiconNER(kbs, Seq(false), useLemmasForMatching = false) // case sensitive match on this KB - new BalaurProcessor(optionalNER = Some(lexiconNer)) + new BalaurProcessor(lexiconNerOpt = Some(lexiconNer)) } def stop(): Unit = { diff --git a/library/src/test/scala/org/clulab/processors/TestHashTrie.scala b/library/src/test/scala/org/clulab/processors/TestHashTrie.scala index d304713b7..4ec8ee171 100644 --- a/library/src/test/scala/org/clulab/processors/TestHashTrie.scala +++ b/library/src/test/scala/org/clulab/processors/TestHashTrie.scala @@ -19,7 +19,7 @@ class TestHashTrie extends Test { //println("TRIE:\n" + trie) - val tokens = Array("a", "a", "b", "d", "a", "b", "d", "b", "b", "b") + val tokens = Seq("a", "a", "b", "d", "a", "b", "d", "b", "b", "b") val labels = trie.find(tokens, "O") //println("TOKENS: " + tokens.mkString(" ")) //println("LABELS: " + labels.mkString(" ")) @@ -44,7 +44,7 @@ class TestHashTrie extends Test { trie.add(Array("this", "is", "c", "test")) trie.add(Array("this", "is", "b", "test")) - val labels = trie.find(Array("this", "is", "c", "test"), "o") + val labels = trie.find(Seq("this", "is", "c", "test"), "o") sameLabels(Array("B-hello", "I-hello", "I-hello", "I-hello"), labels) } @@ -55,7 +55,7 @@ class TestHashTrie extends Test { trie.add(Array("this", "is", "c", "test")) trie.add(Array("this", "is", "d", "test")) - val labels = trie.find(Array("this", "is", "b", "test"), "o") + val labels = trie.find(Seq("this", "is", "b", "test"), "o") sameLabels(Array("o", "o", "o", "o"), labels) } diff --git a/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala b/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala index 4c2fa4c37..48115479c 100644 --- a/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala +++ b/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala @@ -24,8 +24,8 @@ import scala.util.Using class TestLexiconNER extends CluTest { def mkSentence(text: String): Sentence = { - val doc = proc.mkDocument(text) - proc.annotate(doc) + val simpleDoc = proc.mkDocument(text) + val doc = proc.annotate(simpleDoc) doc.sentences.head } diff --git a/library/src/test/scala/org/clulab/processors/TestProcessor.scala b/library/src/test/scala/org/clulab/processors/TestProcessor.scala index 2f57e439f..e6f1e0d1b 100644 --- a/library/src/test/scala/org/clulab/processors/TestProcessor.scala +++ b/library/src/test/scala/org/clulab/processors/TestProcessor.scala @@ -9,7 +9,6 @@ class TestProcessor extends CluTest { "Processor" should "tokenize raw text correctly" in { val doc = proc.mkDocument("John Doe went to China. 
There, he visited Beijing.") - doc.clear() doc.sentences(0).words(0) should be ("John") doc.sentences(0).words(1) should be ("Doe") @@ -40,8 +39,8 @@ class TestProcessor extends CluTest { } it should "POS tag correctly" in { - val doc = proc.mkDocument("John Doe went to China. There, he visited Beijing.") - proc.annotate(doc) + val simpleDoc = proc.mkDocument("John Doe went to China. There, he visited Beijing.") + val doc = proc.annotate(simpleDoc) doc.sentences(0).tags.get(0) should be ("NNP") doc.sentences(0).tags.get(1) should be ("NNP") @@ -59,17 +58,16 @@ class TestProcessor extends CluTest { } it should "POS tag parentheses correctly" in { - val doc = proc.mkDocument("This is a test (of parentheses).") - proc.annotate(doc) + val simpleDoc = proc.mkDocument("This is a test (of parentheses).") + val doc = proc.annotate(simpleDoc) doc.sentences(0).tags.get(4) should be ("-LRB-") doc.sentences(0).tags.get(7) should be ("-RRB-") } it should "recognize syntactic chunks correctly" in { - val doc = proc.mkDocument("He reckons the current account deficit will narrow to only 1.8 billion.") - proc.annotate(doc) - doc.clear() + val simpleDoc = proc.mkDocument("He reckons the current account deficit will narrow to only 1.8 billion.") + val doc = proc.annotate(simpleDoc) doc.sentences(0).chunks.get(0) should be ("B-NP") doc.sentences(0).chunks.get(1) should be ("B-VP") @@ -86,9 +84,8 @@ class TestProcessor extends CluTest { } it should "lemmatize text correctly" in { - val doc = proc.mkDocument("John Doe went to the shops.") - proc.annotate(doc) - doc.clear() + val simpleDoc = proc.mkDocument("John Doe went to the shops.") + val doc = proc.annotate(simpleDoc) doc.sentences(0).lemmas.get(0) should be ("john") doc.sentences(0).lemmas.get(2) should be ("go") @@ -112,40 +109,44 @@ class TestProcessor extends CluTest { } it should "parse MWEs correctly" in { - var sent = "Foods such as icecream are tasty." - var doc = proc.mkDocument(sent) - println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}") - - proc.annotate(doc) - println(s"Enhanced universal dependencies for sentence: $sent") - println(doc.sentences.head.universalEnhancedDependencies.get) - - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod_such_as") should be (true) - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod") should be (false) - - sent = "There was famine due to drought." - doc = proc.mkDocument(sent) - println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}") - - proc.annotate(doc) - println(s"Enhanced universal dependencies for sentence: $sent") - println(doc.sentences.head.universalEnhancedDependencies.get) - - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod_due_to") should be (true) - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 3, "amod") should be (false) - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod") should be (false) - - sent = "They ate cake due to hunger." 
- doc = proc.mkDocument(sent) - println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}") - - proc.annotate(doc) - println(s"Enhanced universal dependencies for sentence: $sent") - println(doc.sentences.head.universalEnhancedDependencies.get) - - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod_due_to") should be (true) - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 3, "amod") should be (false) - doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod") should be (false) + { + val sent = "Foods such as icecream are tasty." + val simpleDoc = proc.mkDocument(sent) + println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}") + + val doc = proc.annotate(simpleDoc) + println(s"Enhanced universal dependencies for sentence: $sent") + println(doc.sentences.head.universalEnhancedDependencies.get) + + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod_such_as") should be(true) + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod") should be(false) + } + { + val sent = "There was famine due to drought." + val simpleDoc = proc.mkDocument(sent) + println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}") + + val doc = proc.annotate(simpleDoc) + println(s"Enhanced universal dependencies for sentence: $sent") + println(doc.sentences.head.universalEnhancedDependencies.get) + + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod_due_to") should be(true) + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 3, "amod") should be(false) + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod") should be(false) + } + { + val sent = "They ate cake due to hunger." + val simpleDoc = proc.mkDocument(sent) + println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}") + + val doc = proc.annotate(simpleDoc) + println(s"Enhanced universal dependencies for sentence: $sent") + println(doc.sentences.head.universalEnhancedDependencies.get) + + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod_due_to") should be(true) + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 3, "amod") should be(false) + doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod") should be(false) + } } it should "parse incomplete sentence without crashing" in { diff --git a/library/src/test/scala/org/clulab/processors/TestTokenizer.scala b/library/src/test/scala/org/clulab/processors/TestTokenizer.scala index c4d67af56..afd2b594d 100644 --- a/library/src/test/scala/org/clulab/processors/TestTokenizer.scala +++ b/library/src/test/scala/org/clulab/processors/TestTokenizer.scala @@ -223,7 +223,7 @@ class TestTokenizer extends Test { } } - def tok(s:String):Array[Sentence] = { + def tok(s: String): Seq[Sentence] = { println(s"Tokenizing text: $s") val t = new OpenDomainEnglishTokenizer(None) val sents = t.tokenize(s) diff --git a/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala b/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala index 5acf466d4..ceabd13f3 100644 --- a/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala +++ b/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala @@ -24,8 +24,8 @@ class TestJSONSerializer extends Test { "A Document with an ID" should "produce json with an \"id\" field" in { - val d = jsonStringToDocument(""" {"sentences":[{"raw":["Gonzo","married","Camilla","."], 
"words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """) - d.id = Some("this-is-an-id") + val id = "this-is-an-id" + val d = jsonStringToDocument(s""" {"id":"$id","sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """) (d.jsonAST \ "id") should equal (JString("this-is-an-id")) } @@ -35,8 +35,7 @@ class TestJSONSerializer extends Test { } "A Document with text" should "produce json with a \"text\" field" in { - val d = jsonStringToDocument(""" {"sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """) - d.text = Some(text) + val d = jsonStringToDocument(s""" {"text":"$text","sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """) (d.jsonAST \ "text") should equal (JString(text)) } @@ -61,11 +60,11 @@ class TestJSONSerializer extends Test { class Scratch(var document: Document) extends JSONSerialization { def jsonAST: JValue = document.jsonAST } - - doc.text = Some("This is a test") // Original failing test requires text + + val docWithText = doc.copy(sentences = doc.sentences, text = 
Some("This is a test")) val documentSerializer = new DocumentSerializer() - val expectedDocAsJSON = new Scratch(doc).json() - val docSaved = documentSerializer.save(doc, keepText = true) + val expectedDocAsJSON = new Scratch(docWithText).json() + val docSaved = documentSerializer.save(docWithText, keepText = true) val docLoaded = documentSerializer.load(docSaved) val actualDocAsJSON = new Scratch(docLoaded).json() diff --git a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala index b84e337a3..a820e26fa 100644 --- a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala +++ b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala @@ -1,7 +1,6 @@ package org.clulab.struct -import org.clulab.processors.Document -import org.clulab.processors.Sentence +import org.clulab.processors.{Document, DocumentAttachment, Sentence} import org.clulab.serialization.DocumentSerializer import org.clulab.serialization.json._ import org.clulab.struct.test.CaseClass @@ -124,68 +123,76 @@ class TestDocumentAttachment extends Test { // } "Document with TextNameDocumentAttachment" should "serialize as text" in { - val oldDocument = new Document(Array.empty[Sentence]) - - oldDocument.addAttachment(FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)) - oldDocument.addAttachment(MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)) - oldDocument.addAttachment(LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)) - oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + val oldAttachments = Map[String, DocumentAttachment]( + (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)), + (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)), + (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)), + (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + ) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) val documentSerializer = new DocumentSerializer() val documentString = documentSerializer.save(oldDocument) val newDocument = documentSerializer.load(documentString) - require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY)) - require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY)) - require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY)) - require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name == - oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name) + val newAttachments = newDocument.attachments.get + + require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY)) + require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY)) + require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY)) + require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name == + oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name) // This one must be avoided. 
/*require(newDocument == oldDocument)*/ } "Document with ObjectNameDocumentAttachment" should "serialize as text" in { - val oldDocument = new Document(Array.empty[Sentence]) - - oldDocument.addAttachment(FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)) - oldDocument.addAttachment(MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)) - oldDocument.addAttachment(LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)) - oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + val oldAttachments = Map[String, DocumentAttachment]( + (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)), + (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)), + (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)), + (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + ) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) val documentSerializer = new DocumentSerializer() // This should be a messy string. val documentString = documentSerializer.save(oldDocument) - val newDocument = documentSerializer.load(documentString) - require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY)) - require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY)) - require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY)) - require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name == - oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name) + val newAttachments = newDocument.attachments.get + + require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY)) + require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY)) + require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY)) + require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name == + oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name) // This one must be avoided. /*require(newDocument == oldDocument)*/ } "Document with TextNameDocumentAttachments" should "serialize as json" in { - val oldDocument = new Document(Array.empty[Sentence]) + val oldAttachments = Map[String, DocumentAttachment]( + (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)), + (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)), + (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)), + (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + ) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) - oldDocument.addAttachment(FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)) - oldDocument.addAttachment(MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)) - oldDocument.addAttachment(LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)) - oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) // This shouldn't compile. 
/*oldDocument.addAttachment("wrong", new NameMethodAttachment("name"))*/ val documentString = prettyJson(renderJValue(oldDocument.jsonAST)) - val newDocument: Document = JSONSerializer.toDocument(parseJson(documentString)) - newDocument.getAttachment(FIRST_KEY) should be (oldDocument.getAttachment(FIRST_KEY)) - newDocument.getAttachment(MIDDLE_KEY) should be (oldDocument.getAttachment(MIDDLE_KEY)) - newDocument.getAttachment(LAST_KEY) should be (oldDocument.getAttachment(LAST_KEY)) - newDocument.getAttachment(ALIAS_KEY).asInstanceOf[Option[NameDocumentAttachment]].get.name should be ( - oldDocument.getAttachment(ALIAS_KEY).asInstanceOf[Option[NameDocumentAttachment]].get.name + val newAttachments = newDocument.attachments.get + + newAttachments(FIRST_KEY) should be (oldAttachments(FIRST_KEY)) + newAttachments(MIDDLE_KEY) should be (oldAttachments(MIDDLE_KEY)) + newAttachments(LAST_KEY) should be (oldAttachments(LAST_KEY)) + newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name should be ( + oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name ) // This one must be avoided. @@ -193,25 +200,26 @@ class TestDocumentAttachment extends Test { } "Document with ObjectNameDocumentAttachment" should "serialize as json" in { - val oldDocument = new Document(Array.empty[Sentence]) - - oldDocument.addAttachment(FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)) - oldDocument.addAttachment(MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)) - oldDocument.addAttachment(LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)) - oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + val oldAttachments = Map[String, DocumentAttachment]( + (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)), + (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)), + (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)), + (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) + ) + val oldDocument = new Document(Seq.empty[Sentence], attachments = Some(oldAttachments)) // This should be a messy string. val documentString = prettyJson(renderJValue(oldDocument.jsonAST)) - val newDocument: Document = JSONSerializer.toDocument(parseJson(documentString)) - require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY)) - require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY)) - require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY)) - require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name == - oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name) + val newAttachments = newDocument.attachments.get + + require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY)) + require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY)) + require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY)) + require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name == + oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name) // This one must be avoided. 
     /*require(newDocument == oldDocument)*/
   }
 }
-
diff --git a/library/src/test/scala/org/clulab/utils/TestArrayView.scala b/library/src/test/scala/org/clulab/utils/TestArrayView.scala
deleted file mode 100644
index 2bfbd08ff..000000000
--- a/library/src/test/scala/org/clulab/utils/TestArrayView.scala
+++ /dev/null
@@ -1,54 +0,0 @@
-package org.clulab.utils
-
-class TestArrayView extends Test {
-
-  behavior of "ArrayView"
-
-  it should "work with no offset" in {
-    val array = Array(1, 2, 3)
-    val arrayView = MutableArrayView(array)
-
-    array.length should be (arrayView.length)
-
-    arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) =>
-      arrayViewItem should be (arrayItem)
-    }
-
-    arrayView(0) = 4
-    arrayView(0) should be (4)
-    array(0) should be (4)
-  }
-
-  it should "work with an offset" in {
-    val offset = 1
-    val array = Array(1, 2, 3)
-    val arrayView = MutableArrayView(array, offset)
-
-    array.length should be (arrayView.length + offset)
-
-    arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) =>
-      arrayViewItem should be (arrayItem + offset)
-    }
-
-    arrayView(0) = 4
-    arrayView(0) should be (4)
-    array(1) should be (4)
-  }
-
-  it should "work when clipped" in {
-    val offset = 1
-    val clip = 1
-    val array = Array(1, 2, 3)
-    val arrayView = MutableArrayView(array, offset, array.length - clip)
-
-    array.length should be (arrayView.length + offset + clip)
-
-    arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) =>
-      arrayViewItem should be (arrayItem + offset)
-    }
-
-    arrayView(0) = 4
-    arrayView(0) should be (4)
-    array(1) should be (4)
-  }
-}
diff --git a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
index 13390e71e..bb9ba3823 100644
--- a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
+++ b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
@@ -6,13 +6,15 @@ import org.clulab.struct.{DirectedGraph, Edge, Interval}
 
 class TestFindHeads extends Test {
 
-  def newSentence(words: Array[String], directedGraph: DirectedGraph[String]): Sentence = {
-    val startOffsets = Array(0) // unused
-    val endOffsets = Array(0) // unused
-    val sentence = new Sentence(words, startOffsets, endOffsets, words)
+  def newSentence(words: Seq[String], directedGraph: DirectedGraph[String]): Sentence = {
+    val startOffsets = Seq(0) // unused
+    val endOffsets = Seq(0) // unused
+    val sentence = new Sentence(
+      words, startOffsets, endOffsets, words,
+      tags = Some(words),
+      graphs = Map(UNIVERSAL_BASIC -> directedGraph)
+    )
 
-    sentence.graphs(UNIVERSAL_BASIC) = directedGraph
-    sentence.tags = Some(words)
     sentence
   }
 
@@ -115,7 +117,7 @@ class TestFindHeads extends Test {
     val len: Int = 78
    val directedGraph = DirectedGraph(edges)
     val tokenInterval = Interval(0, len)
-    val words = 1.to(len).map { index => s"word$index" }.toArray
+    val words = 1.to(len).map { index => s"word$index" }
     val sentence = newSentence(words, directedGraph)
 
     val heads = DependencyUtils.findHeadsStrict(tokenInterval, sentence)
diff --git a/project/build.properties b/project/build.properties
index 11956d958..75ac47aaa 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1,6 +1,9 @@
+# This was last checked on 2025-06-02.
 # Version 1.7.2+ will cause problems when combined with the play plug-in used for the webapp!
 # [error]   * org.scala-lang.modules:scala-xml_2.12:2.1.0 (early-semver) is selected over {1.2.0, 1.1.1}
 # [error]     +- org.scala-lang:scala-compiler:2.12.17 (depends on 2.1.0)
 # [error]     +- com.typesafe.sbt:sbt-native-packager:1.5.2 (scalaVersion=2.12, sbtVersion=1.0) (depends on 1.1.1)
 # [error]     +- com.typesafe.play:twirl-api_2.12:1.5.1 (depends on 1.2.0)
-sbt.version = 1.7.2
+# This error is solved by adding a VersionScheme.Always to plugins.sbt.
+# up to 1.11.1
+sbt.version = 1.11.1
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 273ee7ce6..417c04d23 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,3 +1,5 @@
+ThisBuild / libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
+
 // Latest version numbers were updated on 2024 July 11.
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1") // up to 2.2.1 *
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") // up to 3.9.21 *
diff --git a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala
index 9f4691529..14fc5ebb8 100644
--- a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala
+++ b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala
@@ -33,7 +33,7 @@ class HomeController @Inject()(cc: ControllerComponents) extends AbstractControl
     val kbs = customLexiconNerConfigs.map(_.kb)
     val caseInsensitiveMatchings = customLexiconNerConfigs.map(_.caseInsensitiveMatching)
     val customLexiconNer = LexiconNER(kbs, caseInsensitiveMatchings, None)
-    val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer))
+    val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer))
 
     processor
   }
diff --git a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala
index 0c9bff455..617a4303d 100644
--- a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala
+++ b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala
@@ -14,7 +14,7 @@ class ParseObj(doc: Document) {
     head + xml.Utility.escape(text) + tail
   }
 
-  def getTdAtOptString(option: Option[Array[String]], n: Int): String = {
+  def getTdAtOptString(option: Option[Seq[String]], n: Int): String = {
     val text =
       if (option.isEmpty) ""
       else option.get(n)
@@ -22,9 +22,9 @@ class ParseObj(doc: Document) {
     getTd(text)
   }
 
-  def getTdAtString(values: Array[String], n: Int): String = getTd(values(n))
+  def getTdAtString(values: Seq[String], n: Int): String = getTd(values(n))
 
-  def getTdAtInt(values: Array[Int], n: Int): String = getTd(values(n).toString, true)
+  def getTdAtInt(values: Seq[Int], n: Int): String = getTd(values(n).toString, true)
 
   def edgesToString(to: Int): String = {
     val edges = sentence.dependencies.map(_.incomingEdges(to)).getOrElse(Array.empty)
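Note (not part of the patch): the attachment hunks above all replace post-construction mutation (addAttachment, and likewise sentence.graphs(...) = ... / sentence.tags = ...) with fully populated constructors, and they read attachments back through the immutable attachments option. A minimal Scala sketch of that round-trip pattern, using only names that already appear in the hunks above (DocumentAttachment, NameDocumentAttachment, and the ALIAS_KEY / ALIAS_NAME test constants) and assuming the constructor signatures shown there:

    // Sketch only: attachments are supplied at construction time instead of via addAttachment.
    val attachments = Map[String, DocumentAttachment](ALIAS_KEY -> new NameDocumentAttachment(ALIAS_NAME))
    val document = new Document(sentences = Seq.empty[Sentence], attachments = Some(attachments))
    // They are read back from the attachments option rather than through getAttachment.
    val roundTripped = document.attachments.get
    val aliasName = roundTripped(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name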