"""Train a spaCy multi-label text categorizer from a JSONL file and save it to disk.

Each record in the input file must provide a "text" field and a "label"
field. The label is compared against the names in ``LABELS`` to build the
per-category boolean annotations used for training.
"""

import random

import jsonlines
import spacy
from spacy.training import Example

# Categories the classifier is trained to predict. Kept in one place so the
# pipeline labels and the per-example annotations cannot drift apart.
LABELS = (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
)

# Seed the random number generators once, before any weights are created.
# Re-seeding inside the epoch loop would reset the generator every epoch (so
# every shuffle applies the same permutation) and would leave the weight
# initialisation itself unseeded.
spacy.util.fix_random_seed(1)

# Start from a blank English pipeline (tokenizer only).
nlp = spacy.blank("en")

# Add the text classifier and register every category with it.
textcat = nlp.add_pipe("textcat_multilabel", last=True)
for label_name in LABELS:
    textcat.add_label(label_name)

# Location of the pre-processed training records (one JSON object per line).
processed_data_file = "data/firstStep_file.jsonl"

# Load all records into memory; the reader is closed when the block exits.
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)

if not processed_data:
    # Fail early with a clear message rather than training on nothing.
    raise ValueError(f"No training records found in {processed_data_file!r}")

# Convert each record into a spaCy Example with one boolean per category.
# A record whose label is not in LABELS gets False for every category.
spacy_train_data = []
for obj in processed_data:
    text = obj["text"]
    label = {label_name: obj["label"] == label_name for label_name in LABELS}
    spacy_train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": label}))

# Initialise the model weights using the real training examples as the sample
# and obtain the optimizer used for the updates below.
optimizer = nlp.initialize(lambda: spacy_train_data)

# Train for a fixed number of epochs, reshuffling the examples each epoch.
n_iter = 10
for i in range(n_iter):
    random.shuffle(spacy_train_data)
    losses = {}  # accumulated by nlp.update across the batches of this epoch
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    print("Iteration:", i, "Losses:", losses)

# Persist the trained pipeline so it can be reloaded with spacy.load().
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)