pgvector
Recipe: Implement semantic search using PostgreSQL vector similarity.
Soy supports pgvector operators for embedding-based similarity search.
Setup
Install the pgvector extension:
CREATE EXTENSION IF NOT EXISTS vector;
Define a model with a vector column:
// Document is the model for rows in the documents table. Each field's struct
// tags give its column name (db), column type (type), and, where present,
// column constraints.
type Document struct {
// ID is the serial primary key.
ID int `db:"id" type:"serial" constraints:"primary key"`
// Title is stored in the text column "title".
Title string `db:"title" type:"text"`
// Content is stored in the text column "content".
Content string `db:"content" type:"text"`
// Embedding is stored in a pgvector column with 1536 dimensions.
Embedding []float32 `db:"embedding" type:"vector(1536)"` // OpenAI dimensions
}
Create the soy instance:
// Create the typed soy handle for Document rows in the "documents" table.
// Check err before using documents.
documents, err := soy.New[Document](db, "documents")
Distance Operators
pgvector provides four distance measures:
| Operator | Name | Use Case |
|---|---|---|
| `<->` | L2 (Euclidean) | General similarity |
| `<#>` | Inner product | Normalized vectors |
| `<=>` | Cosine | Text embeddings |
| `<+>` | L1 (Manhattan) | Sparse vectors |
Basic Similarity Search
Find documents similar to a query embedding:
// SearchDocuments returns up to limit documents ordered by ascending cosine
// distance (<=>) between their embedding column and queryEmbedding.
func SearchDocuments(ctx context.Context, queryEmbedding []float32, limit int) ([]*Document, error) {
	// The vector is bound to the named parameter referenced by OrderByExpr.
	params := map[string]any{"query_vec": queryEmbedding}

	nearest := documents.Query().
		OrderByExpr("embedding", "<=>", "query_vec", "asc").
		Limit(limit)

	return nearest.Exec(ctx, params)
}
The OrderByExpr method generates:
SELECT * FROM documents
ORDER BY embedding <=> :query_vec ASC
LIMIT :limit
Retrieving Distance Scores
Use SelectExpr to include computed distance values in results:
// SearchWithScores returns up to limit documents ordered by ascending cosine
// distance to queryEmbedding, and also selects that distance under the alias
// "score".
//
// NOTE(review): the Document struct shown in this recipe has no field tagged
// db:"score"; presumably such a field is needed to read the value back.
func SearchWithScores(ctx context.Context, queryEmbedding []float32, limit int) ([]*Document, error) {
	// The same column/operator/parameter triple drives both the selected
	// score and the ordering, so name it once.
	const (
		column   = "embedding"
		operator = "<=>"
		param    = "query_vec"
	)

	scored := documents.Query().
		SelectExpr(column, operator, param, "score").
		OrderByExpr(column, operator, param, "asc").
		Limit(limit)

	return scored.Exec(ctx, map[string]any{param: queryEmbedding})
}
This generates:
SELECT *, "embedding" <=> :query_vec AS "score"
FROM documents
ORDER BY "embedding" <=> :query_vec ASC
LIMIT :limit
Note: The score column is added to the result set. To access it, you can either:
- Add a `Score` field to your struct with the `db:"score"` tag
- Use `ExecAtom` to get results as a map
Filtered Search
Combine similarity with conditions:
// SearchByCategory returns up to limit documents that satisfy the category
// and published filters, ordered by ascending cosine distance to embedding.
// It assumes successive Where calls are combined with AND and that the table
// has "category" and "published" columns.
func SearchByCategory(ctx context.Context, embedding []float32, category string, limit int) ([]*Document, error) {
	// Named parameters referenced by the Where and OrderByExpr calls below.
	params := map[string]any{
		"query_vec":    embedding,
		"category":     category,
		"is_published": true,
	}

	filtered := documents.Query().
		Where("category", "=", "category").
		Where("published", "=", "is_published")

	return filtered.
		OrderByExpr("embedding", "<=>", "query_vec", "asc").
		Limit(limit).
		Exec(ctx, params)
}
Distance Threshold
Filter by maximum distance:
// SearchWithinDistance is intended to return the documents whose cosine
// distance to embedding is at most maxDistance, nearest first, capped at 100
// rows.
//
// NOTE(review): maxDistance is bound as "max_distance", but no builder call
// below references that parameter, so the threshold does not appear to be
// applied. Confirm how soy expresses a condition of the form
// `embedding <=> :query_vec < :max_distance`.
func SearchWithinDistance(ctx context.Context, embedding []float32, maxDistance float64) ([]*Document, error) {
return documents.Query().
Where("embedding", "<=>", "query_vec"). // NOTE(review): column, operator and query vector only; no comparison against max_distance
OrderByExpr("embedding", "<=>", "query_vec", "asc").
Limit(100).
Exec(ctx, map[string]any{
"query_vec": embedding,
"max_distance": maxDistance,
})
}
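Note: The example above binds `max_distance` but never compares against it — the `Where` call supplies only the `<=>` operator and the query vector. To enforce a distance threshold, use a raw WHERE clause (for example `embedding <=> :query_vec < :max_distance`) or post-filter the results by score.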
With Pagination
Paginate similarity results:
// SearchResult is one page of similarity-search results.
type SearchResult struct {
// Documents holds the rows returned for the requested page.
Documents []*Document
// HasMore reports whether at least one more row exists beyond this page.
HasMore bool
}
// SearchPaginated returns one page of documents ordered by ascending cosine
// distance to embedding.
//
// page is 1-based and perPage is the page size. Values below 1 are clamped to
// 1 for both arguments, so the call never issues a negative OFFSET and never
// slices the results with a negative bound (which would panic).
//
// The query requests perPage+1 rows: the extra row is not returned, it only
// tells the caller whether another page exists (SearchResult.HasMore).
func SearchPaginated(ctx context.Context, embedding []float32, page, perPage int) (*SearchResult, error) {
	if page < 1 {
		page = 1
	}
	if perPage < 1 {
		perPage = 1
	}

	offset := (page - 1) * perPage
	limit := perPage + 1 // Fetch one extra row to detect a following page.

	results, err := documents.Query().
		OrderByExpr("embedding", "<=>", "query_vec", "asc").
		Limit(limit).
		Offset(offset).
		Exec(ctx, map[string]any{"query_vec": embedding})
	if err != nil {
		return nil, err
	}

	hasMore := len(results) > perPage
	if hasMore {
		// Drop the look-ahead row so the page holds exactly perPage items.
		results = results[:perPage]
	}

	return &SearchResult{
		Documents: results,
		HasMore:   hasMore,
	}, nil
}
Hybrid Search
Combine keyword and semantic search:
// HybridSearch combines semantic and keyword retrieval for the same query.
// It fetches up to 50 documents nearest to embedding by cosine distance and up
// to 50 documents whose content contains query (case-insensitive ILIKE), then
// passes both lists to mergeResults, which is not defined in this snippet.
//
// query is matched literally: LIKE metacharacters in it are escaped, so user
// input such as "100%" or "snake_case" cannot act as a wildcard.
func HybridSearch(ctx context.Context, query string, embedding []float32) ([]*Document, error) {
	// Semantic results
	semanticResults, err := documents.Query().
		OrderByExpr("embedding", "<=>", "query_vec", "asc").
		Limit(50).
		Exec(ctx, map[string]any{"query_vec": embedding})
	if err != nil {
		return nil, err
	}

	// Keyword results
	keywordResults, err := documents.Query().
		Where("content", "ILIKE", "search_pattern").
		Limit(50).
		Exec(ctx, map[string]any{
			"search_pattern": "%" + escapeLikePattern(query) + "%",
		})
	if err != nil {
		return nil, err
	}

	// Merge and deduplicate (implement based on your needs)
	return mergeResults(semanticResults, keywordResults), nil
}

// escapeLikePattern backslash-escapes the LIKE/ILIKE metacharacters (%, _ and
// the escape character \ itself) so that s matches only as literal text.
// Backslash is PostgreSQL's default LIKE escape character.
func escapeLikePattern(s string) string {
	// Working on bytes is safe for UTF-8 input: the three metacharacters are
	// ASCII and never occur inside a multi-byte sequence.
	escaped := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '\\', '%', '_':
			escaped = append(escaped, '\\')
		}
		escaped = append(escaped, s[i])
	}
	return string(escaped)
}
From Spec
Build vector queries from specs:
// Declarative form of a filtered similarity query: select three columns,
// filter on category, and order by cosine distance to :query_vec.
spec := soy.QuerySpec{
Fields: []string{"id", "title", "content"},
Where: []soy.ConditionSpec{
{Field: "category", Operator: "=", Param: "category"},
},
OrderBy: []soy.OrderBySpec{
// Here the ordering expression is written as one string, with the named
// parameter embedded as :query_vec.
{Expression: "embedding <=> :query_vec", Direction: "asc"},
},
// intPtr is not defined in this snippet; presumably a helper returning *int.
Limit: intPtr(10),
}
// The map keys supply the named parameters referenced by the spec above.
results, err := documents.QueryFromSpec(spec).Exec(ctx, map[string]any{
"category": "technology",
"query_vec": embedding,
})
Indexing
Create an index for efficient similarity search:
-- HNSW index (recommended for most cases)
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops);
-- IVFFlat index (faster builds, slightly lower recall)
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
Choose the index based on distance operator:
| Operator | Index Ops Class |
|---|---|
| `<->` | `vector_l2_ops` |
| `<#>` | `vector_ip_ops` |
| `<=>` | `vector_cosine_ops` |
| `<+>` | `vector_l1_ops` (HNSW only) |
Embedding Generation
Generate embeddings before insert:
// CreateDocument embeds content and then inserts a new Document carrying the
// title, the content, and the resulting vector. If embedding fails, the error
// is returned and no insert is attempted.
func CreateDocument(ctx context.Context, title, content string) (*Document, error) {
	// Generate embedding (example with OpenAI)
	vec, err := generateEmbedding(content)
	if err != nil {
		return nil, err
	}

	insert := documents.Insert().Build()
	return insert.Exec(ctx, &Document{
		Title:     title,
		Content:   content,
		Embedding: vec,
	})
}
// generateEmbedding is a placeholder for the call to your embedding provider
// (OpenAI, Cohere, etc.). A real implementation returns a []float32 whose
// length matches the dimension of the vector column (1536 in this recipe).
//
// The stub panics instead of returning (nil, nil) so that an unimplemented
// embedder cannot silently store documents without an embedding. Replace the
// panic with the real API call.
func generateEmbedding(text string) ([]float32, error) {
	// Call your embedding API (OpenAI, Cohere, etc.)
	// Returns []float32 with appropriate dimensions
	panic("generateEmbedding: not implemented")
}
Complete Example
package search
import (
"context"
"github.com/jmoiron/sqlx"
"github.com/zoobz-io/soy"
)
// Document is the model for rows in the documents table. Each field's struct
// tags give its column name (db), column type (type), and, where present,
// column constraints.
type Document struct {
// ID is the serial primary key.
ID int `db:"id" type:"serial" constraints:"primary key"`
// Title is stored in the non-null text column "title".
Title string `db:"title" type:"text" constraints:"not null"`
// Content is stored in the text column "content".
Content string `db:"content" type:"text"`
// Category is stored in the text column "category"; Search filters on it.
Category string `db:"category" type:"text"`
// Embedding is stored in a pgvector column with 1536 dimensions.
Embedding []float32 `db:"embedding" type:"vector(1536)"`
}
// SearchService performs embedding-based search over Document rows.
type SearchService struct {
// documents is the soy handle used to build and run queries.
documents *soy.Soy[Document]
// embedder converts the query text into a vector before searching.
embedder Embedder
}
// Embedder turns text into an embedding vector.
type Embedder interface {
// Embed returns the embedding for text, or an error.
// NOTE(review): the vector length presumably has to match the vector(1536)
// column; confirm against the embedding model in use.
Embed(text string) ([]float32, error)
}
// NewSearchService builds a SearchService over the "documents" table using db
// and the supplied embedder. An error from soy.New is returned unchanged.
func NewSearchService(db *sqlx.DB, embedder Embedder) (*SearchService, error) {
	store, err := soy.New[Document](db, "documents")
	if err != nil {
		return nil, err
	}

	svc := &SearchService{
		documents: store,
		embedder:  embedder,
	}
	return svc, nil
}
// Search embeds query and returns the documents nearest to it by cosine
// distance, closest first.
//
//   - opts.Category, when non-empty, restricts results to that category.
//   - opts.IncludeScore additionally selects the distance under the alias
//     "score". NOTE(review): Document has no field tagged db:"score";
//     confirm how the value is meant to be read back.
//   - opts.Limit caps the number of rows. Values <= 0 fall back to
//     defaultLimit so that a zero-value SearchOptions still returns results.
//
// An error from the embedder is returned before any query is run.
func (s *SearchService) Search(ctx context.Context, query string, opts SearchOptions) ([]*Document, error) {
	const defaultLimit = 10

	embedding, err := s.embedder.Embed(query)
	if err != nil {
		return nil, err
	}

	limit := opts.Limit
	if limit <= 0 {
		limit = defaultLimit
	}

	params := map[string]any{"query_vec": embedding}

	q := s.documents.Query()
	if opts.Category != "" {
		q = q.Where("category", "=", "category")
		// Bind the parameter only when the filter that references it exists.
		params["category"] = opts.Category
	}

	// Use SelectExpr to include distance score in results
	if opts.IncludeScore {
		q = q.SelectExpr("embedding", "<=>", "query_vec", "score")
	}

	return q.
		OrderByExpr("embedding", "<=>", "query_vec", "asc").
		Limit(limit).
		Exec(ctx, params)
}
// SearchOptions configures SearchService.Search.
type SearchOptions struct {
// Category, when non-empty, restricts results to rows whose category column
// equals it.
Category string
// Limit is passed to the query's Limit call.
Limit int
// IncludeScore additionally selects the embedding distance under the alias
// "score".
IncludeScore bool
}