def preprocess(self, text):
    """Preprocess raw text for indexing/search.

    Pipeline: lowercase -> strip punctuation/digits -> tokenize on
    whitespace -> drop stop words and single-character tokens -> apply a
    very naive stemmer (strip a trailing ``'s'``).

    Args:
        text: Raw input string; may be empty or None.

    Returns:
        list[str]: Normalized tokens; an empty list for falsy input.
    """
    if not text:
        return []
    # 1. Lowercase so matching is case-insensitive.
    text = text.lower()
    # 2. Replace punctuation and digits with spaces.
    #    NOTE(review): the character class lives in self.pattern — assumed
    #    to match non-letter characters; confirm against __init__.
    text = self.pattern.sub(' ', text)
    # 3. Tokenize on whitespace.
    words = text.split()
    # 4. Remove stop words / single-char tokens, then apply an extremely
    #    simple stemmer: drop a trailing 's'. A real system should use a
    #    proper stemmer such as NLTK's PorterStemmer.
    filtered_words = []
    for word in words:
        if word not in self.stop_words and len(word) > 1:
            if word.endswith('s'):
                word = word[:-1]
            filtered_words.append(word)
    return filtered_words
1、预处理查询词
query_terms = self.processor.preprocess(query)
if not query_terms:
return []
2、获取包含所有查询词的候选文档集合(求交集)
为了处理部分词可能不存在的情况,使用集合操作
candidate_docs = set()
for term in query_terms:
if term in self.index.index:
candidate_docs.update(self.index.index[term].keys())
计算查询向量中每个词的TF-IDF权重(这里查询的TF始终为1)
query_vector = {}
for term in query_terms:
df = self.index.get_document_frequency(term)
if df 0:
idf = math.log(self.index.num_documents / (df + 1)) + 1 平滑并避免负值
query_vector[term] = 1 * idf TF for query is 1 for each term
归一化查询向量
query_norm = math.sqrt(sum([w * w for w in query_vector.values()]))
for doc_id in candidate_docs:
doc_vector = {}
doc_norm = 0.0
doc_length = self.index.doc_lengths[doc_id]
计算文档向量中每个查询词的TF-IDF
for term in query_terms:
if term in self.index.index and doc_id in self.index.index[term]:
tf = self.index.index[term][doc_id]
简单的TF计算:词频 / 文档总词数 (用于余弦相似度)
normalized_tf = tf / doc_length
df = self.index.get_document_frequency(term)
idf = math.log(self.index.num_documents / (df + 1)) + 1
weight = normalized_tf * idf
doc_vector[term] = weight
doc_norm += weight * weight
else:
doc_vector[term] = 0.0
doc_norm = math.sqrt(doc_norm)
计算余弦相似度:点积 / (查询向量模 * 文档向量模)
dot_product = 0.0
for term in query_terms:
dot_product += query_vector.get(term, 0) * doc_vector.get(term, 0)