Spaces:

prb977
/

cooccurrence_count

Runtime error

cooccurrence_count / cooccurrence_count.py

Prabin Bhandari

Use list of words

1861797 over 3 years ago

3.79 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current
	# dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	Get the co-occurrence count for two words in each sentence in a dataset.
	"""


	import evaluate
	import datasets
	from sklearn.feature_extraction.text import CountVectorizer
	import numpy as np
	import stanza


	_DESCRIPTION = """\
	Returns the co-occurrence count of two words in the input.
	"""

	_CITATION = ""

	_KWARGS_DESCRIPTION = """
	Calculates the co-occurence of two words in each sentence.
	Args:
	`data`: a list of `str` which containes a dataset.
	`words`: list of list of two words that we want to check for
	Returns:
	Examples:
	>>> data = ["hello sun","hello moon", "hello sun"]
	>>> c_count = evaluate.load("prb977/cooccurrence_count")
	>>> results = c_count.compute(data=data, words=[['hello','sun']\)
	>>> print(results)
	[['hello','sun',3,2]]
	"""


	def check_count(x):
	    """Return 1 when every count in ``x`` is non-zero, otherwise 0.

	    ``x`` holds the per-term counts for one document (one row of the count
	    matrix). It may be a 1-D array or a single-row 2-D array/matrix; all of
	    its elements are inspected either way, so the result no longer depends
	    on how the row happens to be wrapped by the caller.
	    """
	    # np.asarray flattens away the matrix/ndarray distinction before testing.
	    return 1 if np.asarray(x).all() else 0


	# Shared stanza pipeline (English, 'tokenize' processor only). It is built
	# once, when this module is imported, and reused by `stanza_tokenizer`.
	# NOTE(review): `_download_and_prepare` calls `stanza.download`, which
	# presumably runs after this module has been imported — confirm the pipeline
	# can be constructed here before that download has happened.
	nlp = stanza.Pipeline(lang='en', processors='tokenize')


	def stanza_tokenizer(sen):
	    """Tokenize ``sen`` with the module-level stanza pipeline ``nlp``.

	    Returns the text of every token, across all sentences found in the
	    processed document, as one flat list in document order.
	    """
	    parsed = nlp(sen)
	    return [
	        tok.text
	        for sentence in parsed.sentences
	        for tok in sentence.tokens
	    ]


	@evaluate.utils.file_utils.add_start_docstrings(
	    _DESCRIPTION,
	    _KWARGS_DESCRIPTION
	)
	class CooccurrenceCount(evaluate.Measurement):
	    """This measurement returns the co-occurrence count of two words."""

	    def _info(self):
	        """Describe the measurement; its only declared feature is the string column ``data``."""
	        return evaluate.MeasurementInfo(
	            module_type="measurement",
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            features=datasets.Features({
	                'data': datasets.Value('string')
	            }),
	        )

	    def _download_and_prepare(self, dl_manager):
	        """Download the English stanza resources for the 'tokenize' processor."""
	        stanza.download('en', processors='tokenize')

	    def _compute(self, data, words):
	        """Count, for each word pair, the entries of ``data`` containing both words.

	        Args:
	            data: the texts to search, one string per entry.
	            words: a list of word pairs; the first two items of each entry
	                are the terms to look for.

	        Returns:
	            A new list with one entry per pair: the original items of the
	            pair followed by ``len(data)`` and the number of entries of
	            ``data`` in which both terms occur at least once. The lists
	            passed in ``words`` are left unmodified.
	        """
	        total = len(data)  # identical for every pair, so computed once
	        results = []
	        for pair in words:
	            word1, word2 = pair[0], pair[1]

	            # A term may span several tokens; the n-gram range must cover
	            # the token length of both terms.
	            lgram, ugram = sorted(
	                (len(stanza_tokenizer(word1)), len(stanza_tokenizer(word2)))
	            )

	            # Run the terms through the same analyzer as the data so that
	            # lowercasing and tokenization agree with the vocabulary keys.
	            analyzer = CountVectorizer(
	                ngram_range=(lgram, ugram),
	                tokenizer=stanza_tokenizer,
	                lowercase=True
	            ).build_analyzer()
	            vectorizer = CountVectorizer(
	                ngram_range=(lgram, ugram),
	                vocabulary={
	                    # The last n-gram emitted for a term is used as its key;
	                    # presumably this is the full phrase for a multi-token
	                    # term — verify against sklearn's n-gram ordering.
	                    analyzer(word1)[-1]: 0,
	                    analyzer(word2)[-1]: 1
	                },
	                tokenizer=stanza_tokenizer,
	                lowercase=True
	            )

	            # One row per entry of `data`, one column per term.
	            counts = vectorizer.fit_transform(data).toarray()
	            # A row counts as a co-occurrence when both term counts are > 0.
	            both_present = (counts > 0).all(axis=1)
	            co_occurrence_count = int(np.count_nonzero(both_present))

	            results.append([*pair, total, co_occurrence_count])
	        return results