在阿里的灵积模型中嵌入垂直领域知识库
发表于:2024-04-21 | 分类: AI

在阿里的灵积模型中嵌入垂直领域知识库

灵积模型api调用

1
pip install dashscope

你的api key建议直接加入环境变量中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from http import HTTPStatus
import dashscope

def sample_sync_call():
    """Send one fixed recipe prompt to ``qwen-turbo`` and print the outcome.

    On a response whose ``status_code`` is ``HTTPStatus.OK`` the response's
    ``output`` and ``usage`` are printed; otherwise its error ``code`` and
    ``message`` are printed.
    """
    response = dashscope.Generation.call(
        model='qwen-turbo',
        prompt='用萝卜、土豆、茄子做饭,给我个菜谱。',
    )
    # Anything other than HTTPStatus.OK means the request failed; the details
    # are carried in the response's `code` and `message`.
    if response.status_code != HTTPStatus.OK:
        print(response.code)
        print(response.message)
        return
    print(response.output)
    print(response.usage)

# Run the sample request once when this snippet is executed.
sample_sync_call()

embedding

  1. 本地知识库的向量化。通过文本向量模型将其转化为高质量低维度的向量数据,再写入DashVector向量检索服务。这里数据的向量化我们采用了灵积模型服务上的Embedding API实现。

  2. 相关知识点的提取。将提问文本向量化后,通过DashVector提取相关知识点的原文。

  3. 构造Prompt进行提问。将相关知识点作为限定上下文,与提问一起组成prompt,询问通义千问。

20240421035739

本地知识库的向量化

1
pip install dashvector

dashscope.api_key、dashvector的api_key和endpoint需要填写你自己的,加载语料处的路径也要改为你数据集的目录路径。batch_size可根据自己的需求调整。我这里用的数据集是从“人人都是产品经理”上爬下来的文章。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os

import dashscope
from dashscope import TextEmbedding
import dashvector
from dashvector import Client, Doc

def prepare_data(path, batch_size=25):
    """Yield the text of the files under *path* in batches.

    Args:
        path: Directory holding the corpus; each regular file directly inside
            it is read as one UTF-8 document.
        batch_size: Number of documents per yielded list.

    Yields:
        Lists of document strings. Every list holds ``batch_size`` items
        except possibly the last one, which carries the remainder.
    """
    batch_docs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order stable between runs, which matters to callers that
    # derive ids from a document's position.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Sub-directories and other non-regular entries cannot be read as a
        # document, so skip them instead of crashing in open().
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            batch_docs.append(f.read())
        if len(batch_docs) == batch_size:
            yield batch_docs
            batch_docs = []

    # Emit whatever is left over after the last full batch.
    if batch_docs:
        yield batch_docs


def generate_embeddings(news):
    """Embed text with the DashScope ``text_embedding_v1`` model.

    Args:
        news: A single string or a list of strings to embed.

    Returns:
        A list of embedding vectors when *news* is a list, otherwise the
        first (only) embedding vector.

    Raises:
        RuntimeError: If the response's ``status_code`` is not
            ``HTTPStatus.OK``.
    """
    # Local import: this snippet's import block does not bring HTTPStatus in.
    from http import HTTPStatus

    rsp = TextEmbedding.call(
        model=TextEmbedding.Models.text_embedding_v1,
        input=news
    )
    # Surface the service's own error code/message instead of failing later
    # while indexing into `rsp.output`.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'embedding request failed: {rsp.code}: {rsp.message}'
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(news, list) else embeddings[0]


if __name__ == '__main__':
    # Ingest script: embed every corpus batch and upsert it into the
    # 'product' DashVector collection.
    dashscope.api_key = ''

    # Initialise the DashVector client.
    client = dashvector.Client(
        api_key='',
        endpoint=''
    )
    assert client

    # Create the collection by name and vector dimension; 1536 is the
    # dimension of the vectors produced by the text_embedding_v1 model.
    rsp = client.create('product', 1536)
    assert rsp

    # Load the corpus batch by batch. Iterating the generator directly avoids
    # holding every document in memory before the first upsert.
    next_id = 0
    collection = client.get('product')
    for news in prepare_data('knowledgedata/PM-data-master/woshipm.com'):
        print(news)
        # Consecutive integer ids, continuing where the previous batch ended.
        ids = range(next_id, next_id + len(news))
        next_id += len(news)

        vectors = generate_embeddings(news)
        # Write to DashVector to build the index; the raw text is stored
        # alongside each vector under the "raw" field.
        rsp = collection.upsert(
            [
                Doc(id=str(doc_id), vector=vector, fields={"raw": doc})
                for doc_id, vector, doc in zip(ids, vectors, news)
            ]
        )
        assert rsp

知识点的提取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import dashvector
from dashvector import Client
from dashvector import Doc
from embedding import generate_embeddings


def search_relevant_news(question):
    """Return the stored ``raw`` text of the top-1 vector hit for *question*.

    The question is embedded with ``generate_embeddings`` and used to query
    the ``product`` collection with ``topk=1``.
    """
    # Connect to DashVector.
    vector_client = dashvector.Client(api_key='', endpoint='')
    assert vector_client

    # Open the collection that the documents were stored in.
    product_collection = vector_client.get('product')
    assert product_collection

    # Vector search, asking for a single result and its "raw" field.
    question_vector = generate_embeddings(question)
    result = product_collection.query(
        question_vector,
        output_fields=['raw'],
        topk=1,
    )
    assert result
    top_match = result.output[0]
    return top_match.fields['raw']


# Read one question from stdin and print the knowledge-base text retrieved for it.
question = input("请输入问题:")
print(search_relevant_news(question))

构造Prompt进行提问

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import dashscope
from http import HTTPStatus
import os
import webbrowser

import dashvector
from dashvector import Client
from dashvector import Doc
from embedding import generate_embeddings

def generate_response(prompt):
    """Stream a completion for *prompt* from ``chatglm3-6b`` and return its text.

    Args:
        prompt: Full prompt text sent to the model.

    Returns:
        The ``text`` of the last streamed chunk, or ``None`` when a chunk
        reports a non-OK status (its error code and message are printed) or
        the stream yields no chunk at all.
    """
    response_generator = dashscope.Generation.call(
        model='chatglm3-6b',
        prompt=prompt,
        stream=True,
        top_p=0.8)

    print("\n")
    text = None
    for resp in response_generator:
        # Check the status before touching `output`, so a failed chunk is
        # reported through its code/message rather than by an indexing error.
        if resp.status_code != HTTPStatus.OK:
            print(resp.code)  # The error code.
            print(resp.message)  # The error message.
            return None
        # Keep only the most recent chunk's text; presumably each chunk
        # carries the text generated so far — TODO confirm against the
        # streaming mode in use.
        text = resp.output['text']
    return text

def search_relevant_news(question):
    """Look up the knowledge-base text that best matches *question*.

    Embeds the question via ``generate_embeddings``, runs a ``topk=1`` vector
    query against the ``product`` collection and returns the ``raw`` field of
    the first hit.
    """
    # Set up the DashVector client.
    dv_client = dashvector.Client(api_key='', endpoint='')
    assert dv_client

    # Fetch the collection the corpus was written to.
    docs = dv_client.get('product')
    assert docs

    # Vector search restricted to the single best hit.
    query_vector = generate_embeddings(question)
    hits = docs.query(query_vector, output_fields=['raw'], topk=1)
    assert hits
    best = hits.output[0]
    return best.fields['raw']

# 初始化对话历史列表
dialogue_history = []

while True:
# 获取用户输入的对话内容
origin_user_input = input("\n\n请描述您的问题:")

# 结束循环条件:用户未输入新对话
if not origin_user_input:
break

# 添加知识库参考
user_input = search_relevant_news(origin_user_input) + "\n请参考以上资料回答下列问题\n" + origin_user_input
# print(user_input)

# 将用户对话内容添加到历史记录中
dialogue_history.append(user_input)

# 构建包含对话历史的新提示
prompt_text = "\n".join(dialogue_history)

# 使用新的对话上下文生成回复
generated_response = generate_response(prompt_text)

print(generated_response)

# 获取当前工作目录
current_dir = os.getcwd()

with open('answer.txt', 'w', encoding='utf-8') as f:
f.write(generated_response)

# 循环结束后,程序会在这里结束

如此一来,大模型就会根据你知识库的内容来回答你的问题

提问:需求分析怎么做

这是通过dashvector从知识库中检索到的内容
20240421042410

这是构造prompt后的对话
20240421042239

很明显大模型参考了知识库,然后根据问题回答

上一篇:
部署minecraft服务器并添加mods
下一篇:
搭建国内可访问的ai网页端