Prabin Bhandari committed · Commit 1ea30cb
1 Parent(s): 2c03ef4
Some changes
Files changed: cooccurrence_count.py (+26 -4)
cooccurrence_count.py CHANGED

@@ -21,6 +21,7 @@ import evaluate
 import datasets
 from sklearn.feature_extraction.text import CountVectorizer
 import numpy as np
+import stanza
 
 
 _DESCRIPTION = """\
@@ -53,6 +54,18 @@ def check_count(x):
     return 1
 
 
+nlp = stanza.Pipeline(lang='en', processors='tokenize')
+
+
+def stanza_tokenizer(sen):
+    doc = nlp(sen)
+    tokens = []
+    for sen in doc.sentences:
+        for token in sen.tokens:
+            tokens.append(token.text)
+    return tokens
+
+
 @evaluate.utils.file_utils.add_start_docstrings(
     _DESCRIPTION,
     _KWARGS_DESCRIPTION
@@ -71,9 +84,12 @@ class CooccurrenceCount(evaluate.Measurement):
             }),
         )
 
+    def _download_and_prepare(self, dl_manager):
+        stanza.download('en', processors='tokenize')
+
     def _compute(self, data, word1, word2):
-        len1 = len(word1
-        len2 = len(word2
+        len1 = len(stanza_tokenizer(word1))
+        len2 = len(stanza_tokenizer(word2))
         if len1 > len2:
             ugram = len1
             lgram = len2
@@ -84,14 +100,20 @@ class CooccurrenceCount(evaluate.Measurement):
             ugram = len1
             lgram = len1
 
-        v = CountVectorizer(
+        v = CountVectorizer(
+            ngram_range=(lgram, ugram),
+            tokenizer=stanza_tokenizer,
+            lowercase=True
+        )
         analyzer = v.build_analyzer()
         vectorizer = CountVectorizer(
             ngram_range=(lgram, ugram),
             vocabulary={
                 analyzer(word1)[-1]: 0,
                 analyzer(word2)[-1]: 1
-            }
+            },
+            tokenizer=stanza_tokenizer,
+            lowercase=True
         )
         co_occurrences = vectorizer.fit_transform(data)
         dense_mat = co_occurrences.todense()
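Taken together, the commit replaces scikit-learn's default token handling with stanza tokenization, both for sizing the n-gram range and for counting the two target phrases. Below is a minimal standalone sketch of the post-commit counting logic, assuming stanza and scikit-learn are installed; the sample data, word1, and word2 values are illustrative, not from the repository:

import stanza
from sklearn.feature_extraction.text import CountVectorizer

# One-time model fetch; in the measurement this lives in _download_and_prepare.
stanza.download('en', processors='tokenize')
nlp = stanza.Pipeline(lang='en', processors='tokenize')

def stanza_tokenizer(sen):
    # Flatten stanza's sentence/token structure into a plain list of token strings.
    doc = nlp(sen)
    return [token.text for sentence in doc.sentences for token in sentence.tokens]

# Illustrative inputs; the real measurement receives these through compute().
data = ["Machine learning is a branch of AI.", "AI relies on machine learning."]
word1, word2 = "machine learning", "AI"

# N-gram bounds now come from stanza token counts rather than raw string handling;
# min/max reproduces the if/else in _compute.
len1 = len(stanza_tokenizer(word1))
len2 = len(stanza_tokenizer(word2))
lgram, ugram = min(len1, len2), max(len1, len2)

# The first vectorizer only supplies the analyzer that normalizes the two target
# phrases (lowercasing + stanza tokens + n-gram joining) into vocabulary keys.
v = CountVectorizer(ngram_range=(lgram, ugram), tokenizer=stanza_tokenizer, lowercase=True)
analyzer = v.build_analyzer()

# The second vectorizer counts only those two normalized phrases per document.
vectorizer = CountVectorizer(
    ngram_range=(lgram, ugram),
    vocabulary={analyzer(word1)[-1]: 0, analyzer(word2)[-1]: 1},
    tokenizer=stanza_tokenizer,
    lowercase=True,
)
co_occurrences = vectorizer.fit_transform(data)
dense_mat = co_occurrences.todense()  # one row per document, one column per phrase
print(dense_mat)

Deriving lgram and ugram from the same tokenizer that CountVectorizer uses keeps the vocabulary keys, analyzer(word1)[-1] and analyzer(word2)[-1] (the longest n-gram of each phrase), consistent with how the documents themselves are tokenized; each row of dense_mat then holds the per-document counts of the two phrases, from which their co-occurrence can be read off.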