# this is the configuration file for the GROBID instance that uses the Deep Learning Models.

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "/opt/grobid/grobid-home"
  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"
  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120
    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    # service: "crossref"
    service: "glutton"
    glutton:
      url: "http://sciencialab.ddns.net:8080"
      # url: "http://localhost:8080"
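      # a minimal sketch to check that the glutton instance is reachable, assuming the standard
      # biblio-glutton lookup endpoint (the DOI below is illustrative):
      # curl "http://localhost:8080/service/lookup?doi=10.1038/nature12373"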
    crossref:
      mailto:
      # to use the crossref web API, you normally need to use it politely and indicate an email address here, e.g.
      #mailto: "[email protected]"
      token:
      # to use the Crossref Metadata Plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when making external calls to the consolidation service
    host:
    port:
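    # for example, assuming a local corporate proxy (hypothetical values):
    # host: "proxy.example.org"
    # port: 3128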
  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
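  # for instance, to restrict the API to a single front-end instead of "*" (hypothetical origin):
  # corsAllowedOrigins: "https://my-frontend.example.org"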
  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"

  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
  # to get the best performance and security
  concurrency: 10
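  # e.g. following the rule above, a dedicated server exposing 16 hardware threads would use (illustrative):
  # concurrency: 18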
  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait
  # for an engine (in seconds) - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # DeLFT installation path if Deep Learning architectures are used to implement one of the sequence labeling models;
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF is used)
    install: "/opt/delft"
    pythonVirtualEnv:
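    # e.g. pointing to the virtualenv where the DeLFT dependencies are installed (hypothetical path):
    # pythonVirtualEnv: "/opt/delft/env"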
  wapiti:
    # Wapiti global parameters
    # number of threads for training the wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # we configure here how each sequence labeling model should be implemented
    # for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    # parameters then depend on the selected DL architecture

    - name: "segmentation"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "segmentation-article-light"
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000

    - name: "segmentation-article-light-ref"
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000

    - name: "fulltext"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
        nbMaxIterations: 1500

    - name: "header"
      engine: "wapiti"
      # engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          #max_sequence_length: 510
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          #max_sequence_length: 510
          #batch_size: 6
          max_sequence_length: 3000
          batch_size: 9

    - name: "header-article-light"
      engine: "wapiti"
      # engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false

    - name: "header-article-light-ref"
      engine: "wapiti"
      # engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false

    - name: "reference-segmenter"
      engine: "wapiti"
      # engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
          max_sequence_length: 3000
          batch_size: 2
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "name-header"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "name-citation"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "date"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "figure"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "table"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "affiliation-address"
      engine: "wapiti"
      # engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "citation"
      engine: "wapiti"
      # engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 3000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 500
          batch_size: 30
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 50
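        # note: if the commented BERT_CRF alternative above were enabled, max_sequence_length would
        # have to be reduced to 512 or less, the usual input limit of BERT-style transformers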
| - name: "patent-citation" | |
| engine: "wapiti" | |
| wapiti: | |
| # wapiti training parameters, they will be used at training time only | |
| epsilon: 0.0001 | |
| window: 20 | |
| delft: | |
| # deep learning parameters | |
| architecture: "BidLSTM_CRF_FEATURES" | |
| #architecture: "BERT_CRF" | |
| runtime: | |
| # parameters used at runtime/prediction | |
| max_sequence_length: 800 | |
| batch_size: 20 | |
| training: | |
| # parameters used for training | |
| max_sequence_length: 1000 | |
| batch_size: 40 | |
| - name: "funding-acknowledgement" | |
| engine: "wapiti" | |
| # engine: "delft" | |
| wapiti: | |
| # wapiti training parameters, they will be used at training time only | |
| epsilon: 0.00001 | |
| window: 50 | |
| nbMaxIterations: 2000 | |
| delft: | |
| # deep learning parameters | |
| architecture: "BidLSTM_CRF_FEATURES" | |
| #architecture: "BERT_CRF" | |
| #transformer: "michiyasunaga/LinkBERT-base" | |
| useELMo: false | |
| runtime: | |
| # parameters used at runtime/prediction | |
| max_sequence_length: 800 | |
| batch_size: 20 | |
| training: | |
| # parameters used for training | |
| max_sequence_length: 500 | |
| batch_size: 40 | |
| - name: "copyright" | |
| # at this time, we only have a DeLFT implementation, | |
| # use "wapiti" if the deep learning library JNI is not available and model will then be ignored | |
| # engine: "delft" | |
| engine: "wapiti" | |
| delft: | |
| # deep learning parameters | |
| architecture: "gru" | |
| #architecture: "bert" | |
| #transformer: "allenai/scibert_scivocab_cased" | |
| - name: "license" | |
| # at this time, for being active, it must be DeLFT, no other implementation is available | |
| # use "wapiti" if the deep learning library JNI is not available and model will then be ignored | |
| # engine: "delft" | |
| engine: "wapiti" | |
| delft: | |
| # deep learning parameters | |
| architecture: "gru" | |
| #architecture: "bert" | |
| #transformer: "allenai/scibert_scivocab_cased" | |
| # for **service only**: how to load the models, | |
| # false -> models are loaded when needed, avoiding putting in memory useless models (only in case of CRF) but slow down | |
| # significantly the service at first call | |
| # true -> all the models are loaded into memory at the server startup (default), slow the start of the services | |
| # and models not used will take some more memory (only in case of CRF), but server is immediatly warm and ready | |
| modelPreload: true | |
| server: | |
| type: custom | |
| applicationConnectors: | |
| - type: http | |
| port: 8070 | |
| adminConnectors: | |
| - type: http | |
| port: 8071 | |
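  # once started, a quick liveness check can be run against the standard GROBID service endpoint, e.g.:
  # curl http://localhost:8070/api/isalive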
  registerDefaultExceptionMappers: false

  # change the following to have all http requests logged
  requestLog:
    appenders: []
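  # for example, replacing the empty list above with a console appender (a minimal Dropwizard sketch):
  # requestLog:
  #   appenders:
  #     - type: console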
# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: WARN
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json