DagimB
/

ecfr-textcat

machine learning

natural language processing

Model card Files Files and versions

ecfr-textcat / python_Code /secondStep-score.py

DagimB's picture

Upload 78 files

49f0c5b verified almost 2 years ago

history blame contribute delete

1.71 kB

	import spacy
	from spacy.training import Example
	import jsonlines
	import random

	# Train a multi-label spaCy text classifier on the labelled JSONL records
	# and save the resulting pipeline to disk.

	# Category names the classifier predicts. Kept in one place so the pipeline
	# labels and the per-example "cats" dictionaries cannot drift apart.
	LABELS = (
	    "CapitalRequirements",
	    "ConsumerProtection",
	    "RiskManagement",
	    "ReportingAndCompliance",
	    "CorporateGovernance",
	)

	# Path to the processed data file
	PROCESSED_DATA_FILE = "data/firstStep_file.jsonl"

	# Directory the trained pipeline is written to
	OUTPUT_DIR = "./my_trained_model"

	# Number of passes over the training data
	N_ITER = 10


	def cats_for(label, labels=LABELS):
	    """Return the ``cats`` mapping for a record carrying a single label.

	    Args:
	        label: The label string stored on the record.
	        labels: All category names to include in the mapping.

	    Returns:
	        A dict with one key per entry in ``labels``; the value is ``True``
	        only for the entry equal to ``label``. A label that matches no
	        entry yields an all-``False`` mapping.
	    """
	    return {name: label == name for name in labels}


	def build_pipeline(labels=LABELS):
	    """Create a blank English pipeline with a multi-label text classifier.

	    Args:
	        labels: Category names to register on the classifier.

	    Returns:
	        The spaCy pipeline object with a ``textcat_multilabel`` component.
	    """
	    nlp = spacy.blank("en")
	    textcat = nlp.add_pipe("textcat_multilabel", last=True)
	    for name in labels:
	        textcat.add_label(name)
	    return nlp


	def load_examples(nlp, path, labels=LABELS):
	    """Read the JSONL file at ``path`` and convert each record to an Example.

	    Each record must provide a ``"text"`` and a ``"label"`` key; a missing
	    key raises ``KeyError``.

	    Args:
	        nlp: Pipeline used to tokenize each text via ``nlp.make_doc``.
	        path: Location of the JSONL file.
	        labels: Category names used to build each ``cats`` mapping.

	    Returns:
	        A list of ``Example`` objects, one per record, in file order.
	    """
	    with jsonlines.open(path) as reader:
	        return [
	            Example.from_dict(
	                nlp.make_doc(obj["text"]),
	                {"cats": cats_for(obj["label"], labels)},
	            )
	            for obj in reader
	        ]


	def train(nlp, examples, n_iter=N_ITER, batch_size=8, seed=1):
	    """Train ``nlp`` in place on ``examples`` and print the losses per epoch.

	    Args:
	        nlp: Pipeline to initialize and update.
	        examples: List of training ``Example`` objects; shuffled in place.
	        n_iter: Number of passes over ``examples``.
	        batch_size: Number of examples per update step.
	        seed: Value passed to ``spacy.util.fix_random_seed``.

	    Raises:
	        ValueError: If ``examples`` is empty.
	    """
	    if not examples:
	        raise ValueError("No training examples were loaded; nothing to train on.")

	    # Seed once, before initialization. Seeding at the top of every epoch
	    # (and only after initialization) rewinds the RNG to the same state each
	    # pass, so the same shuffle permutation is replayed every epoch, and it
	    # leaves the initialization step itself unseeded.
	    spacy.util.fix_random_seed(seed)

	    # Initialize the model from the real training examples and get the optimizer
	    optimizer = nlp.initialize(lambda: examples)

	    for i in range(n_iter):
	        random.shuffle(examples)
	        losses = {}
	        for batch in spacy.util.minibatch(examples, size=batch_size):
	            nlp.update(batch, losses=losses, sgd=optimizer)
	        print("Iteration:", i, "Losses:", losses)


	def main():
	    """Build the pipeline, train it on the processed data, and save it."""
	    nlp = build_pipeline()
	    spacy_train_data = load_examples(nlp, PROCESSED_DATA_FILE)
	    train(nlp, spacy_train_data)

	    # Save the trained model
	    nlp.to_disk(OUTPUT_DIR)


	if __name__ == "__main__":
	    main()