Spaces:
Runtime error
Runtime error
| import edu.stanford.nlp.ie.AbstractSequenceClassifier; | |
| import edu.stanford.nlp.ie.crf.*; | |
| import edu.stanford.nlp.io.IOUtils; | |
| import edu.stanford.nlp.ling.CoreLabel; | |
| import edu.stanford.nlp.ling.CoreAnnotations; | |
| import edu.stanford.nlp.sequences.DocumentReaderAndWriter; | |
| import edu.stanford.nlp.util.Triple; | |
| import java.util.List; | |
| /** This is a demo of calling CRFClassifier programmatically. | |
| * <p> | |
| * Usage: {@code java -mx400m -cp "*" NERDemo [serializedClassifier [fileName]] } | |
| * <p> | |
| * If arguments aren't specified, they default to | |
| * classifiers/english.all.3class.distsim.crf.ser.gz and some hardcoded sample text. | |
| * If run with arguments, it shows some of the ways to get k-best labelings and | |
| * probabilities out with CRFClassifier. If run without arguments, it shows some of | |
| * the alternative output formats that you can get. | |
| * <p> | |
| * To use CRFClassifier from the command line: | |
| * </p><blockquote> | |
| * {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -textFile [file] } | |
| * </blockquote><p> | |
| * Or if the file is already tokenized and one word per line, perhaps in | |
| * a tab-separated value format with extra columns for part-of-speech tag, | |
| * etc., use the version below (note the 's' instead of the 'x'): | |
| * </p><blockquote> | |
| * {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -testFile [file] } | |
| * </blockquote> | |
| * | |
| * @author Jenny Finkel | |
| * @author Christopher Manning | |
| */ | |
| public class NERDemo { | |
| public static void main(String[] args) throws Exception { | |
| String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz"; | |
| if (args.length > 0) { | |
| serializedClassifier = args[0]; | |
| } | |
| AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier); | |
| /* For either a file to annotate or for the hardcoded text example, this | |
| demo file shows several ways to process the input, for teaching purposes. | |
| */ | |
| if (args.length > 1) { | |
| /* For the file, it shows (1) how to run NER on a String, (2) how | |
| to get the entities in the String with character offsets, and | |
| (3) how to run NER on a whole file (without loading it into a String). | |
| */ | |
| String fileContents = IOUtils.slurpFile(args[1]); | |
| List<List<CoreLabel>> out = classifier.classify(fileContents); | |
| for (List<CoreLabel> sentence : out) { | |
| for (CoreLabel word : sentence) { | |
| System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); | |
| } | |
| System.out.println(); | |
| } | |
| System.out.println("---"); | |
| out = classifier.classifyFile(args[1]); | |
| for (List<CoreLabel> sentence : out) { | |
| for (CoreLabel word : sentence) { | |
| System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); | |
| } | |
| System.out.println(); | |
| } | |
| System.out.println("---"); | |
| List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents); | |
| for (Triple<String, Integer, Integer> item : list) { | |
| System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third())); | |
| } | |
| System.out.println("---"); | |
| System.out.println("Ten best entity labelings"); | |
| DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter(); | |
| classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter); | |
| System.out.println("---"); | |
| System.out.println("Per-token marginalized probabilities"); | |
| classifier.printProbs(args[1], readerAndWriter); | |
| // -- This code prints out the first order (token pair) clique probabilities. | |
| // -- But that output is a bit overwhelming, so we leave it commented out by default. | |
| // System.out.println("---"); | |
| // System.out.println("First Order Clique Probabilities"); | |
| // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter); | |
| } else { | |
| /* For the hard-coded String, it shows how to run it on a single | |
| sentence, and how to do this and produce several formats, including | |
| slash tags and an inline XML output format. It also shows the full | |
| contents of the {@code CoreLabel}s that are constructed by the | |
| classifier. And it shows getting out the probabilities of different | |
| assignments and an n-best list of classifications with probabilities. | |
| */ | |
| String[] example = {"Good afternoon Rajat Raina, how are you today?", | |
| "I go to school at Stanford University, which is located in California." }; | |
| for (String str : example) { | |
| System.out.println(classifier.classifyToString(str)); | |
| } | |
| System.out.println("---"); | |
| for (String str : example) { | |
| // This one puts in spaces and newlines between tokens, so just print not println. | |
| System.out.print(classifier.classifyToString(str, "slashTags", false)); | |
| } | |
| System.out.println("---"); | |
| for (String str : example) { | |
| // This one is best for dealing with the output as a TSV (tab-separated column) file. | |
| // The first column gives entities, the second their classes, and the third the remaining text in a document | |
| System.out.print(classifier.classifyToString(str, "tabbedEntities", false)); | |
| } | |
| System.out.println("---"); | |
| for (String str : example) { | |
| System.out.println(classifier.classifyWithInlineXML(str)); | |
| } | |
| System.out.println("---"); | |
| for (String str : example) { | |
| System.out.println(classifier.classifyToString(str, "xml", true)); | |
| } | |
| System.out.println("---"); | |
| for (String str : example) { | |
| System.out.print(classifier.classifyToString(str, "tsv", false)); | |
| } | |
| System.out.println("---"); | |
| // This gets out entities with character offsets | |
| int j = 0; | |
| for (String str : example) { | |
| j++; | |
| List<Triple<String,Integer,Integer>> triples = classifier.classifyToCharacterOffsets(str); | |
| for (Triple<String,Integer,Integer> trip : triples) { | |
| System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n", | |
| trip.first(), trip.second(), trip.third, j); | |
| } | |
| } | |
| System.out.println("---"); | |
| // This prints out all the details of what is stored for each token | |
| int i=0; | |
| for (String str : example) { | |
| for (List<CoreLabel> lcl : classifier.classify(str)) { | |
| for (CoreLabel cl : lcl) { | |
| System.out.print(i++ + ": "); | |
| System.out.println(cl.toShorterString()); | |
| } | |
| } | |
| } | |
| System.out.println("---"); | |
| } | |
| } | |
| } | |