2 changes: 0 additions & 2 deletions pyproject.toml
@@ -48,8 +48,6 @@ dependencies = [
"jieba-next",
"python-pinyin",
"pyxdg",
"stopwordsiso",
"setuptools", # req by stopwordsiso, https://stackoverflow.com/a/39930983/4799273
"wcwidth",
"wordsegment",
# CUSTOM DEPENDENCIES END
4 changes: 2 additions & 2 deletions src/sphinxnotes/snippet/ext.py
@@ -63,9 +63,9 @@ def extract_excerpt(s: Snippet) -> str:
 def extract_keywords(s: Snippet) -> list[str]:
     keywords = [s.docname]
     if isinstance(s, WithTitle) and s.title is not None:
-        keywords.extend(extractor.extract(s.title, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.title))
     if isinstance(s, Code):
-        keywords.extend(extractor.extract(s.desc, strip_stopwords=False))
+        keywords.extend(extractor.extract(s.desc))
     return keywords


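For readability, this is extract_keywords as it stands once the hunk above is applied (a straight reconstruction of the new side of the diff; Snippet, WithTitle, Code, and the module-level extractor instance are defined elsewhere in ext.py and assumed unchanged):

def extract_keywords(s: Snippet) -> list[str]:
    # The document name is always included as a keyword.
    keywords = [s.docname]
    # Titles and code descriptions now go through extract() without the
    # removed strip_stopwords flag.
    if isinstance(s, WithTitle) and s.title is not None:
        keywords.extend(extractor.extract(s.title))
    if isinstance(s, Code):
        keywords.extend(extractor.extract(s.desc))
    return keywords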
16 changes: 1 addition & 15 deletions src/sphinxnotes/snippet/keyword.py
@@ -26,7 +26,6 @@ def __init__(self):
         from langid import rank
         from jieba_next import cut_for_search, setLogLevel
         from pypinyin import lazy_pinyin
-        from stopwordsiso import stopwords
         from wordsegment import load, segment

         # Turn off jieba debug log.
@@ -38,16 +37,13 @@ def __init__(self):
         self._tokenize_zh_cn = cut_for_search
         self._tokenize_en = segment
         self._pinyin = lazy_pinyin
-        self._stopwords = stopwords

         self._punctuation = (
             string.punctuation
             + '!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.·'
         )

-    def extract(
-        self, text: str, top_n: int | None = None, strip_stopwords: bool = True
-    ) -> list[str]:
+    def extract(self, text: str, top_n: int | None = None) -> list[str]:
         """Return keywords of given text."""
         # TODO: zh -> en
         # Normalize
@@ -57,8 +53,6 @@ def extract(
         # Invalid token removal
         words = self.strip_invalid_token(words)
         # Stopwords removal
-        if strip_stopwords:
-            words = self.strip_stopwords(words)
         if top_n:
             # Get top n words as keyword
             keywords = Counter(words).most_common(top_n)
@@ -106,13 +100,5 @@ def tokenize(self, text: str) -> list[str]:
     def trans_to_pinyin(self, word: str) -> str | None:
         return ' '.join(self._pinyin(word, errors='ignore'))

-    def strip_stopwords(self, words: list[str]) -> list[str]:
-        stw = self._stopwords(['en', 'zh'])
-        new_words = []
-        for word in words:
-            if word not in stw:
-                new_words.append(word)
-        return new_words
-
     def strip_invalid_token(self, tokens: list[str]) -> list[str]:
         return [token for token in tokens if token != '']
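
Taken together, these hunks leave extract() with a single optional top_n parameter. A minimal usage sketch follows, assuming the class shown in keyword.py is named Extractor (as the extractor instance used in ext.py suggests); the import path and sample string are hypothetical:

from sphinxnotes.snippet.keyword import Extractor

extractor = Extractor()
# Per the hunks above, extract() tokenizes the text, drops empty tokens,
# and optionally keeps the top_n most frequent words; stopwords are no
# longer filtered out, and callers no longer pass strip_stopwords.
keywords = extractor.extract('Configuring the snippet extension', top_n=5)
print(keywords)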