def preprocess(self, text):
    """Preprocess raw text for indexing/search.

    Pipeline: lowercase -> strip punctuation/digits -> tokenize on
    whitespace -> drop stop words and single-character tokens -> apply a
    very naive stemmer (strip a trailing ``'s'``).

    Args:
        text: Raw input string; may be empty or None.

    Returns:
        list[str]: Normalized tokens; an empty list for falsy input.
    """
    if not text:
        return []
    # 1. Lowercase so matching is case-insensitive.
    text = text.lower()
    # 2. Replace punctuation and digits with spaces.
    #    NOTE(review): the character class lives in self.pattern — assumed
    #    to match non-letter characters; confirm against __init__.
    text = self.pattern.sub(' ', text)
    # 3. Tokenize on whitespace.
    words = text.split()
    # 4. Remove stop words / single-char tokens, then apply an extremely
    #    simple stemmer: drop a trailing 's'. A real system should use a
    #    proper stemmer such as NLTK's PorterStemmer.
    filtered_words = []
    for word in words:
        if word not in self.stop_words and len(word) > 1:
            if word.endswith('s'):
                word = word[:-1]
            filtered_words.append(word)
    return filtered_words
1、预处理查询词
query_terms = self.processor.preprocess(query)
if not query_terms:
return []
2、获取包含所有查询词的候选文档集合(求交集)
为了处理部分词可能不存在的情况,使用集合操作
candidate_docs = set()
for term in query_terms:
if term in self.index.index:
candidate_docs.update(self.index.index[term].keys())
计算查询向量中每个词的TF-IDF权重(这里查询的TF始终为1)
query_vector = {}
for term in query_terms:
df = self.index.get_document_frequency(term)
if df 0:
idf = math.log(self.index.num_documents / (df + 1)) + 1 平滑并避免负值
query_vector[term] = 1 * idf TF for query is 1 for each term
归一化查询向量
query_norm = math.sqrt(sum([w * w for w in query_vector.values()]))
for doc_id in candidate_docs:
doc_vector = {}
doc_norm = 0.0
doc_length = self.index.doc_lengths[doc_id]
计算文档向量中每个查询词的TF-IDF
for term in query_terms:
if term in self.index.index and doc_id in self.index.index[term]:
tf = self.index.index[term][doc_id]
简单的TF计算:词频 / 文档总词数 (用于余弦相似度)
normalized_tf = tf / doc_length
df = self.index.get_document_frequency(term)
idf = math.log(self.index.num_documents / (df + 1)) + 1
weight = normalized_tf * idf
doc_vector[term] = weight
doc_norm += weight * weight
else:
doc_vector[term] = 0.0
doc_norm = math.sqrt(doc_norm)
计算余弦相似度:点积 / (查询向量模 * 文档向量模)
dot_product = 0.0
for term in query_terms:
dot_product += query_vector.get(term, 0) * doc_vector.get(term, 0)