CogComp · danyaljj · Dec 12, 2014 · Dec 12, 2014 · Dec 22, 2014 · Dec 31, 2014
diff --git a/README.md b/README.md
@@ -32,7 +32,8 @@ An application that identifies the part of speech (e.g. verb + tense, noun + num
 in plain text.
  - [illinois-ner](ner/README.md)
 An application that identifies named entities in plain text according to two different sets of categories. 
-
+ - [illinois-srl](srl/README.md)
+An application to annotate natural language sentences with semantic roles. 
 
 ## Using each library programmatically 
 

diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationCache.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationCache.java
@@ -26,4 +26,6 @@ public interface TextAnnotationCache {
  boolean contains(TextAnnotation ta);
 
  void removeTextAnnotation(TextAnnotation ta);
+
+ void close();
 }
diff --git a/...ilities/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationDBHandler.java b/...ilities/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationDBHandler.java
@@ -276,4 +276,7 @@ public void removeTextAnnotation(TextAnnotation ta) {
  throw new RuntimeException(e);
  }
  }
+
+ @Override
+ public void close() {}
 }
diff --git a/...ties/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationMapDBHandler.java b/...ties/src/main/java/edu/illinois/cs/cogcomp/core/io/caches/TextAnnotationMapDBHandler.java
@@ -11,7 +11,6 @@
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
 import edu.illinois.cs.cogcomp.core.io.IOUtils;
 import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
-import org.jetbrains.annotations.NotNull;
 import org.mapdb.DB;
 import org.mapdb.DBMaker;
 import org.mapdb.Serializer;
@@ -127,7 +126,6 @@ private ConcurrentMap<Integer, byte[]> getMap(String dataset) {
  }
 
  @SuppressWarnings("ConstantConditions")
- @NotNull
  private Iterable<String> getAllDatasets() {
  ReentrantReadWriteLock.ReadLock lock = db.getLock$mapdb().readLock();
  lock.tryLock();

diff --git a/pom.xml b/pom.xml
@@ -18,6 +18,7 @@
  <module>pos</module>
  <module>chunker</module>
  <module>ner</module>
+ <module>srl</module>
  <module>corpusreaders</module>
  <module>lbjava-nlp-tools</module>
  <module>big-data-utils</module>

diff --git a/srl/CHANGELOG b/srl/CHANGELOG
@@ -0,0 +1,55 @@
+Version 3.0.73
+Moved to the super-project and changed the versioning to the super-project versioning
+
+Version 5.1.12
+Added Windows support (including access to non-Gurobi solver)
+
+Version 5.1.4
+Switched entirely to illinois-sl for structured prediction (removed JLIS traces)
+Using the latest AnnotatorService from illinois-core-utilities for both Curator & pipeline annotation
+Major cleaning up
+
+Version 5.1
+Added JUnit tests
+Removed unnecessary dependencies
+Switched to illinois-nlp-pipeline-0.1.2
+Minor fixes
+
+Version 5.0
+Standalone SRL using illinois-nlp-pipeline
+
+Version 4.1.1
+Switched to edison-0.7.1 and LBJava-1.0
+Added dependency to illinois-common-resources
+
+Version 4.1
+Various bugfixes
+
+Version 4.0.2
+Updated inference dependency to latest version and modified inference
+code accordingly.
+
+Version 4.0.1
+Removed duplicate code from JLIS-core and moved to IllinoisSL. Minor edits.
+
+Version 4.0
+A complete rewrite of the SRL. Includes predicate and sense detectors,
+new constraints and a memory footprint of only 3GB.
+
+Version 3.0.3
+Minor bugfixes. Uses edison v0.2.9
+
+Version 3.0.2
+Added an option to trim leading prepositions from arguments.
+
+Revamped the training mechanism to train using LBJ's BatchTrainer in
+the code. This allows manual lexicon handling, which reduces the
+memory requirements by nearly 40 percent.
+
+Version 3.0.1
+Minor bugfix
+
+Version 3.0
+A complete Java based re-implementation of the Illinois SRL from
+Punyakanok 2008. This version uses LBJ to train classifiers and
+for performing inference with a home-brewed beam search.
diff --git a/srl/README.md b/srl/README.md
@@ -0,0 +1,49 @@
+# illinois-srl: Semantic Role Labeler
+
+### Running
+You can use the **illinois-srl** system in either *interactive* or *annotator* mode.
+#### Interactive mode
+In *interactive mode* the user can input a single piece of text and get back the output of both 
+the **Nom**inal and **Verb**al SRL systems in plain text. 
+
+To run the system in *interactive mode* see the class `edu.illinois.cs.cogcomp.srl.SemanticRoleLabeler`
+or simply execute the `run-interactive` script: 
+
+For linux:
+```
+scripts/run-interactive.sh
+```
+
+For windows:
+```
+cd scripts
+run-interactive-win.bat
+```
+
+#### As an `Annotator` component
+**illinois-srl** can also be used programmatically through the `SemanticRoleLabeler` class which implements CogComp's
+[Annotator interface](http://cogcomp.cs.illinois.edu/software/doc/illinois-core-utilities/apidocs/edu/illinois/cs/cogcomp/core/datastructures/textannotation/Annotator.html).
+
+The main method is `getView(TextAnnotation)` inside `SemanticRoleLabeler`. This will add a new 
+[`PredicateArgumentView`](http://cogcomp.cs.illinois.edu/software/doc/illinois-core-utilities/apidocs/edu/illinois/cs/cogcomp/core/datastructures/textannotation/PredicateArgumentView.html)
+for either **Nom**inal or **Verb**al SRL. 
+
+### Training
+To train the SRL system you will require access to the [Propbank](https://verbs.colorado.edu/~mpalmer/projects/ace.html)
+or [Nombank](http://nlp.cs.nyu.edu/meyers/NomBank.html) corpora. You need to set pointers to these in the 
+`config/srl-config.properties` file.
+(To train the system with a non-Prop/Nombank corpus, you need to extend 
+[`AbstractSRLAnnotationReader`](http://cogcomp.cs.illinois.edu/software/doc/illinois-core-utilities/apidocs/edu/illinois/cs/cogcomp/nlp/corpusreaders/AbstractSRLAnnotationReader.html))
+
+To perform the whole training/testing suite, run the `Main` class with parameters `<config-file> expt Verb|Nom true`.
+This will:
+
+1. Read and cache the datasets (train/test)
+2. Annotate each `TextAnnotation` with the required views
+ (here you can set the `UseCurator` flag to false to use CogComp's standalone NLP pipeline) 
+3. Pre-extract and cache the features for the classifiers
+4. Train the classifiers
+5. Evaluate on the (cached) test corpus
+
+**IMPORTANT** After training, make sure you comment out the pre-trained SRL model dependencies inside 
+`pom.xml` (lines 27-38). 
diff --git a/srl/config/learner.properties b/srl/config/learner.properties
@@ -0,0 +1,20 @@
+# Available learning models: {L2LossSSVM, StructuredPerceptron}
+LEARNING_MODEL = L2LossSSVM
+
+# Available solver types: {DCDSolver, ParallelDCDSolver, DEMIParallelDCDSolver}
+L2_LOSS_SSVM_SOLVER_TYPE = ParallelDCDSolver
+
+NUMBER_OF_THREADS = 8
+
+# Regularization parameter
+C_FOR_STRUCTURE = 1.0
+
+# Mini-batch for 'warm' start
+TRAINMINI = true
+TRAINMINI_SIZE = 10000
+
+# Suppress optimality check
+CHECK_INFERENCE_OPT = false
+
+# Number of training rounds
+MAX_NUM_ITER = 100
diff --git a/srl/config/pipeline.properties b/srl/config/pipeline.properties
@@ -0,0 +1,17 @@
+## Flags for whether to use different annotators
+usePos = true
+useLemma = true
+useShallowParse = true
+useNerConll = true
+useNerOntonotes = false
+useStanfordParse = true
+useStanfordDep = true
+useSrlVerb = false
+useSrlNom = false
+
+## Flags for the Stanford parser (for pre-processing)
+# Max time per sentence (in milliseconds)
+stanfordMaxTimePerSentence = 1000
+
+# Max sentence length (will throw an exception if the sentence is longer)
+stanfordParseMaxSentenceLength = 80
diff --git a/srl/config/srl-config.properties b/srl/config/srl-config.properties
@@ -0,0 +1,43 @@
+## Illinois SRL Configuration ##
+
+# Whether to use the Illinois Curator to get the required annotations for training/testing
+# If set to false, Illinois NLP pipeline will be used
+UseCurator = false
+
+# The configuration of the Illinois NLP pipeline
+PipelineConfig = config/pipeline.properties
+
+# The parser used to extract constituents and syntactic features
+# Options are: Charniak, Berkeley, Stanford
+# NB: Only Stanford can be used in standalone mode.
+DefaultParser = Stanford
+
+# The configuration for the Structured learner
+LearnerConfig = config/learner.properties
+
+# Number of threads for feature extraction
+NumFeatExtThreads = 10
+
+# The ILP solver to use for the joint inference
+# Options are: Gurobi, OJAlgo
+ILPSolver = OJAlgo
+
+# The TextAnnotation caching mechanism to use
+# Options are: MapDB, H2
+DatasetCache = MapDB
+
+### Training corpora directories ###
+# This is the directory of the merged (mrg) WSJ files
+PennTreebankHome = /shared/corpora/corporaWeb/treebanks/eng/pennTreebank/treebank-3/parsed/mrg/wsj/
+PropbankHome = /shared/corpora/corporaWeb/treebanks/eng/propbank_1/data
+NombankHome = /shared/corpora/corporaWeb/treebanks/eng/nombank/
+
+# The directory of the sentence and pre-extracted features database (~5G of space required)
+# Not used during test/working with pre-trained models
+CacheDirectory = cache
+
+ModelsDirectory = models
+
+# Directory to output gold and predicted files for manual comparison
+# Comment out for no output
+OutputDirectory = srl-out
diff --git a/srl/pom.xml b/srl/pom.xml
@@ -0,0 +1,133 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <parent>
+ <artifactId>illinois-cogcomp-nlp</artifactId>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <version>3.0.77</version>
+ </parent>
+
+ <modelVersion>4.0.0</modelVersion>
+ <artifactId>illinois-srl</artifactId>
+ <packaging>jar</packaging>
+ <url>http://cogcomp.cs.illinois.edu</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+ <cogcomp-nlp-pipeline-version>0.1.24</cogcomp-nlp-pipeline-version>
+ </properties>
+
+ <dependencies>
+ <!-- Include the pre-trained SRL models for running SemanticRoleLabeler -->
+ <!-- Notice that the models need to match up to the minor version number -->
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-srl-models</artifactId>
+ <classifier>verb-stanford</classifier>
+ <version>5.1</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-srl-models</artifactId>
+ <classifier>nom-stanford</classifier>
+ <version>5.1</version>
+ </dependency>
+
+ <!--The Illinois pipeline can be used instead -->
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-nlp-pipeline</artifactId>
+ <version>${cogcomp-nlp-pipeline-version}</version>
+ <exclusions>
+ <exclusion>
+ <artifactId>illinois-srl</artifactId>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <!-- The following 3 projects are now developed under illinois-cogcomp-nlp -->
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-core-utilities</artifactId>
+ <version>3.0.77</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-curator</artifactId>
+ <version>3.0.77</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-edison</artifactId>
+ <version>3.0.77</version>
+ </dependency>
+ <dependency>
+ <groupId>com.gurobi</groupId>
+ <artifactId>gurobi</artifactId>
+ <version>6.5</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-common-resources</artifactId>
+ <classifier>illinoisSRL</classifier>
+ <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-common-resources</artifactId>
+ <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-common-resources</artifactId>
+ <classifier>ner</classifier>
+ <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-sl-core</artifactId>
+ <version>1.0.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.6</version>
+ </dependency>
+ <dependency>
+ <groupId>com.h2database</groupId>
+ <artifactId>h2</artifactId>
+ <version>1.4.190</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.illinois.cs.cogcomp</groupId>
+ <artifactId>illinois-inference</artifactId>
+ <version>0.6.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.tartarus</groupId>
+ <artifactId>snowball</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.12</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <reporting>
+ <excludeDefaults>true</excludeDefaults>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ <version>2.10.3</version>
+ </plugin>
+ </plugins>
+ </reporting>
+
+</project>