
Commit 209b6d3

chore: Drop stopwords related logic
1 parent: fa840f9

3 files changed: +3, -19 lines

pyproject.toml

Lines changed: 0 additions & 2 deletions
@@ -48,8 +48,6 @@ dependencies = [
     "jieba-next",
     "python-pinyin",
     "pyxdg",
-    "stopwordsiso",
-    "setuptools", # req by stopwordsiso, https://stackoverflow.com/a/39930983/4799273
     "wcwidth",
     "wordsegment",
     # CUSTOM DEPENDENCIES END
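
Of the two dropped entries, setuptools was only ever listed because stopwordsiso needs pkg_resources at import time (see the Stack Overflow answer linked in the removed comment), so it leaves together with the library. A quick hypothetical sanity check, run against a checkout of this repo with the src/ layout shown in this commit, to confirm no stale references remain:

# Hypothetical check, not part of the commit: scan package sources for
# leftover references to the dropped dependency.
from pathlib import Path

stale = [
    p for p in Path('src/sphinxnotes/snippet').rglob('*.py')
    if 'stopwordsiso' in p.read_text()
]
assert not stale, f'stale references to stopwordsiso: {stale}'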

src/sphinxnotes/snippet/ext.py

Lines changed: 2 additions & 2 deletions
@@ -63,9 +63,9 @@ def extract_excerpt(s: Snippet) -> str:
 def extract_keywords(s: Snippet) -> list[str]:
     keywords = [s.docname]
     if isinstance(s, WithTitle) and s.title is not None:
-        keywords.extend(extractor.extract(s.title, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.title))
     if isinstance(s, Code):
-        keywords.extend(extractor.extract(s.desc, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.desc))
     return keywords

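Note that both call sites already passed strip_stopwords=False, so dropping the keyword argument changes the call shape here without changing what these lines return. A self-contained sketch of the new extract() interface, using a hypothetical stub rather than the project's real extractor class:

from collections import Counter

class StubExtractor:
    """Hypothetical stand-in mirroring the new extract() signature."""

    def extract(self, text: str, top_n: int | None = None) -> list[str]:
        # Naive whitespace tokenizer; the real extractor does much more.
        words = [w for w in text.lower().split() if w]
        if top_n:
            return [w for w, _ in Counter(words).most_common(top_n)]
        return words

extractor = StubExtractor()
keywords = ['index']                                   # stand-in for s.docname
keywords.extend(extractor.extract('Getting Started'))  # no strip_stopwords kwarg
print(keywords)  # ['index', 'getting', 'started']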
src/sphinxnotes/snippet/keyword.py

Lines changed: 1 addition & 15 deletions
@@ -26,7 +26,6 @@ def __init__(self):
         from langid import rank
         from jieba_next import cut_for_search, setLogLevel
         from pypinyin import lazy_pinyin
-        from stopwordsiso import stopwords
         from wordsegment import load, segment

         # Turn off jieba debug log.
@@ -38,16 +37,13 @@ def __init__(self):
         self._tokenize_zh_cn = cut_for_search
         self._tokenize_en = segment
         self._pinyin = lazy_pinyin
-        self._stopwords = stopwords

         self._punctuation = (
             string.punctuation
             + '!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.·'
         )

-    def extract(
-        self, text: str, top_n: int | None = None, strip_stopwords: bool = True
-    ) -> list[str]:
+    def extract(self, text: str, top_n: int | None = None) -> list[str]:
         """Return keywords of given text."""
         # TODO: zh -> en
         # Normalize
@@ -57,8 +53,6 @@ def extract(
         # Invalid token removal
         words = self.strip_invalid_token(words)
         # Stopwords removal
-        if strip_stopwords:
-            words = self.strip_stopwords(words)
         if top_n:
             # Get top n words as keyword
             keywords = Counter(words).most_common(top_n)
@@ -106,13 +100,5 @@ def tokenize(self, text: str) -> list[str]:
     def trans_to_pinyin(self, word: str) -> str | None:
         return ' '.join(self._pinyin(word, errors='ignore'))

-    def strip_stopwords(self, words: list[str]) -> list[str]:
-        stw = self._stopwords(['en', 'zh'])
-        new_words = []
-        for word in words:
-            if word not in stw:
-                new_words.append(word)
-        return new_words
-
     def strip_invalid_token(self, tokens: list[str]) -> list[str]:
         return [token for token in tokens if token != '']
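
Downstream code that still wants stopword filtering can reproduce the deleted method in a few lines; a minimal sketch built on the same stopwordsiso call the removed code made (the library now has to be installed separately):

from stopwordsiso import stopwords  # the dependency dropped from pyproject.toml

def strip_stopwords(words: list[str]) -> list[str]:
    stw = stopwords(['en', 'zh'])  # same languages the removed method hard-coded
    return [w for w in words if w not in stw]

print(strip_stopwords(['the', 'keyword', '的', '提取']))  # -> ['keyword', '提取']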
