Building a Vespa Standalone Application with Python
Vespa Standalone Application with Python
We will explore how to set up a standalone Vespa application using Python. Vespa is a powerful engine for large-scale data processing and serving, suitable for implementing fast and scalable search applications. We will walk through the process of installing the necessary dependencies, configuring the application package, deploying it, and performing various types of searches. Let's dive in!
Step 1: Install Dependencies
To get started, you need to install the required Python packages. We'll use pyvespa for interacting with Vespa, vespacli for command-line utilities, and datasets for loading datasets. Open your terminal and run the following commands:
bashCopy
pip install pyvespa vespacli
pip install datasets
Step 2: Load Dependencies
Once the necessary packages are installed, we can load them in our Python script. These libraries will help us define our Vespa application, handle document operations, and interact with Vespa Cloud.
pythonCopy
from vespa.package import (
ApplicationPackage,
Field,
Schema,
Document,
HNSW,
RankProfile,
Component,
Parameter,
FieldSet,
GlobalPhaseRanking,
Function
)
from vespa.deployment import VespaCloud
from vespa.io import VespaResponse, VespaQueryResponse
import os
import pandas as pd
from datasets import load_dataset
import json
Step 3: Load Dataset and Convert it to Vespa Format
For this example, we will use a dataset from Hugging Face's datasets library. We'll convert the dataset into a format suitable for Vespa, which requires mapping data into documents with specific fields.
pythonCopy
# Load the dataset
dataset = load_dataset("BeIR/nfcorpus", "corpus", split="corpus", streaming=False)
# Convert dataset to Vespa format
vespa_feed = dataset.map(
lambda x: {
"id": x["_id"],
"fields": {"title": x["title"], "body": x["text"], "id": x["_id"]}
}
)Step 4: Configure and Create Application Package
Next, we define the configuration for our Vespa application. This includes setting up fields, schemas, and rank profiles.
pythonCopy
# Configuration parameters
tenant_name = "your_tenant_name"
application = "your_application_name"
# Create application package
package = ApplicationPackage(
name=application,
schema=[
Schema(
name="doc",
document=Document(
fields=[
Field(name="id", type="string", indexing=["summary"]),
Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
Field(name="body", type="string", indexing=["index", "summary"], index="enable-bm25", bolding=True),
Field(
name="embedding",
type="tensor<float>(x[384])",
indexing=['input title . " " . input body', "embed", "index", "attribute"],
ann=HNSW(distance_metric="angular"),
is_document_field=False,
),
]
),
fieldsets=[FieldSet(name="default", fields=["title", "body"])],
rank_profiles=[
RankProfile(
name="bm25",
inputs=[("query(q)", "tensor<float>(x[384])")],
functions=[
Function(name="bm25sum", expression="bm25(title) + bm25(body)")
],
first_phase="bm25sum",
),
RankProfile(
name="semantic",
inputs=[("query(q)", "tensor<float>(x[384])")],
first_phase="closeness(field, embedding)",
),
RankProfile(
name="fusion",
inherits="bm25",
inputs=[("query(q)", "tensor<float>(x[384])")],
first_phase="closeness(field, embedding)",
global_phase=GlobalPhaseRanking(
expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
rerank_count=1000,
),
),
],
)
],
components=[
Component(
id="e5",
type="hugging-face-embedder",
parameters=[
Parameter(
"transformer-model",
{
"url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"
},
),
Parameter(
"tokenizer-model",
{
"url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"
},
),
],
)
],
)Step 5: Deploy Application to Vespa Cloud
After configuring the application package, we deploy it to Vespa Cloud.
pythonCopy
# Create the Vespa Cloud handle for this tenant/application and deploy the
# package; the returned app object is what the later steps feed and query.
vespa_cloud = VespaCloud(tenant=tenant_name, application=application, application_package=package)
app = vespa_cloud.deploy()
# Get the endpoint
endpoint = vespa_cloud.get_mtls_endpoint()
Step 6: Feed Documents to Vespa
Now we can feed our prepared data into the Vespa application.
pythonCopy
# Feed documents
def callback(response: VespaResponse, id: str) -> None:
    """Report a feed operation that did not succeed.

    Handed to ``app.feed_iterable`` as its ``callback`` argument. Successful
    responses are ignored; for any other response the document id and the
    response's JSON payload are printed.

    Args:
        response: Result of feeding one document.
        id: Id of the document the response belongs to. The name shadows the
            builtin ``id`` but is kept so the signature stays unchanged.
    """
    if not response.is_successful():
        print(f"Error when feeding document {id}: {response.get_json()}")
app.feed_iterable(vespa_feed, schema="doc", namespace="tutorial", callback=callback)
Step 7: Query Vespa
Vespa allows for various types of search queries. Here are some examples:
Plain Keyword Search
pythonCopy
with app.syncio(connections=1) as session:
query = "How Fruits and Vegetables Can Treat Asthma?"
response = session.query(
yql="select * from sources * where userQuery() limit 5",
query=query,
ranking="bm25",
)
if response.is_successful():
print(response.hits)Plain Semantic Search
pythonCopy
with app.syncio(connections=1) as session:
query = "How Fruits and Vegetables Can Treat Asthma?"
response = session.query(
yql="select * from sources * where ({targetHits:5}nearestNeighbor(embedding,q)) limit 5",
query=query,
ranking="semantic",
body={"input.query(q)": f"embed({query})"},
)
if response.is_successful():
print(response.hits)Hybrid Search
pythonCopy
with app.syncio(connections=1) as session:
query = "How Fruits and Vegetables Can Treat Asthma?"
response = session.query(
yql="select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q)) limit 5",
query=query,
ranking="fusion",
body={"input.query(q)": f"embed({query})"},
)
if response.is_successful():
print(response.hits)Step 8: Document Operations
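Finally, you can perform document operations such as retrieving and updating documents through Vespa's /document/v1 HTTP API. The example below fetches one document, changes its title, and writes it back; a document can be deleted the same way by sending an HTTP DELETE request to the same URL.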
pythonCopy
import requests

# One HTTP session that presents the application's certificate and key.
session = requests.Session()
session.cert = (app.cert, app.key)

# Get a document: fetch MED-10 from the "tutorial" namespace, "doc" schema.
url = f"{endpoint}/document/v1/tutorial/doc/docid/MED-10"
get_reply = session.get(url)
doc = get_reply.json()

# Update the document: change the title locally, then POST it back.
doc["fields"]["title"] = "Can you eat lobster?"
post_reply = session.post(url, json=doc)
response = post_reply.json()
# Verify the update
updated_doc = session.get(url).json()
Conclusion
In this post, we've covered the essential steps for setting up a standalone Vespa application using Python. From installing dependencies to deploying the application and performing various searches, Vespa provides robust capabilities for building scalable search applications. Happy coding!
Last updated