package stringutil

import (
	"fmt"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// --- Test Helper Functions ---

// simpleTokenizer counts whitespace-separated fields, so one word is one
// token. It keeps the token arithmetic in these tests easy to follow.
func simpleTokenizer(text string) int {
	return len(strings.Fields(text))
}

// newTestConfig returns a ChunkConfig that uses simpleTokenizer, the given
// token limits, and "pre", "code" and "table" as preserved blocks.
//
// The parameters avoid the names max and min so they do not shadow the
// builtins of the same name.
func newTestConfig(maxTokens, minTokens, overlapTokens int) ChunkConfig {
	return ChunkConfig{
		MaxTokens:      maxTokens,
		MinTokens:      minTokens,
		OverlapTokens:  overlapTokens,
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
}

// generateHTML returns count lines of the form "<tag>content i</tag>\n",
// where i is the zero-based element index.
func generateHTML(tag, content string, count int) string {
	var b strings.Builder
	for i := 0; i < count; i++ {
		// Four arguments need four verbs: the closing tag consumes the
		// trailing tag argument, which fmt would otherwise append to the
		// output as "%!(EXTRA string=...)".
		fmt.Fprintf(&b, "<%s>%s %d</%s>\n", tag, content, i, tag)
	}
	return b.String()
}

// --- Test Cases ---

// TestChunkHTMLContent_Basic runs ChunkHTMLContent over a table of inputs and
// checks either the error text (when expectedError is set) or the number of
// chunks, followed by an optional per-case validate callback.
func TestChunkHTMLContent_Basic(t *testing.T) {
	testCases := []struct {
		name           string
		title          string
		html           string
		config         ChunkConfig
		expectedChunks int
		expectedError  string
		validate       func(*testing.T, []HTMLChunk)
	}{
		{
			name:           "Empty Content",
			title:          "Empty Test",
			html:           " ",
			config:         DefaultChunkConfig(),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, "Empty Test", chunks[0].Text)
				assert.True(t, chunks[0].Metadata["empty"].(bool))
				assert.Equal(t, 0, chunks[0].ChunkIndex)
				assert.Equal(t, 1, chunks[0].TotalChunks)
			},
		},
		{
			name:           "Title Only with Empty HTML",
			title:          "Title Only",
			html:           "",
			config:         DefaultChunkConfig(),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, "Title Only", chunks[0].Text)
			},
		},
		{
			// NOTE(review): the html literal below spans raw line breaks and
			// carries no tags even though the case asserts HasHeading —
			// presumably the markup was stripped from this copy; confirm
			// against the upstream file.
			name:  "Single Chunk Scenario",
			title: "Single Chunk",
			html: "

This is a heading that should create a chunk

", config: newTestConfig(100, 10, 5),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Contains(t, chunks[0].Text, "This is a heading")
				assert.Equal(t, 1, chunks[0].TotalChunks)
				assert.True(t, chunks[0].HasHeading)
			},
		},
		{
			name:           "Multiple Chunks Scenario",
			title:          "Multiple Chunks",
			html:           generateHTML("h3", "This is a heading.", 10), // 10 headings * 4 words = 40 tokens
			config:         newTestConfig(20, 10, 5),
			expectedChunks: 3, // Adjusted based on actual behavior
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, 3, chunks[0].TotalChunks)
				assert.Equal(t, 0, chunks[0].ChunkIndex)
				assert.Equal(t, 1, chunks[1].ChunkIndex)
				assert.Equal(t, 2, chunks[2].ChunkIndex)
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			chunks, err := ChunkHTMLContent(tc.title, tc.html, tc.config)
			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
				return
			}

			require.NoError(t, err)
			require.Len(t, chunks, tc.expectedChunks)
			if tc.validate != nil {
				tc.validate(t, chunks)
			}
		})
	}
}

// TestChunkConfig_Validation passes valid and invalid configurations to
// ChunkHTMLContent and compares the returned error with expectedError; an
// empty expectedError means the call must succeed.
func TestChunkConfig_Validation(t *testing.T) {
	testCases := []struct {
		name          string
		config        ChunkConfig
		expectedError string
	}{
		{
			name:          "Invalid MaxTokens <= MinTokens",
			config:        newTestConfig(100, 100, 10),
			expectedError: "MaxTokens must be greater than MinTokens",
		},
		{
			name:          "Invalid OverlapTokens >= MinTokens",
			config:        newTestConfig(100, 50, 50),
			expectedError: "OverlapTokens must be less than MinTokens",
		},
		{
			name: "Custom Tokenizer",
			config: ChunkConfig{
				MaxTokens:     10,
				MinTokens:     5,
				OverlapTokens: 2,
				TokenizerFunc: func(s string) int { return len(s) }, // byte length as a crude token count
			},
			expectedError: "", // Should be valid
		},
		{
			name:          "Nil Tokenizer should default",
			config:        ChunkConfig{MaxTokens: 100, MinTokens: 50, OverlapTokens: 10, TokenizerFunc: nil},
			expectedError: "",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := ChunkHTMLContent("test", "

hello

", tc.config)
			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
				return
			}
			require.NoError(t, err)
		})
	}
}

// TestChunkHTMLContent_EdgeCases runs ChunkHTMLContent over unusual inputs
// and checks the resulting chunk count (or error text) per case.
//
// NOTE(review): the html literals in this table contain raw line breaks and
// no tags (for example "Malformed HTML" has nothing unclosed and "Deeply
// Nested HTML" nothing nested) — presumably the markup was stripped from this
// copy; confirm against the upstream file.
func TestChunkHTMLContent_EdgeCases(t *testing.T) {
	testCases := []struct {
		name           string
		html           string
		config         ChunkConfig
		expectedChunks int
		expectedError  string
		validate       func(*testing.T, []HTMLChunk)
	}{
		{
			name: "Malformed HTML",
			html: "

This is unclosed text

", config: newTestConfig(100, 10, 5),
			expectedChunks: 1, // Should still parse leniently
		},
		{
			name: "Deeply Nested HTML",
			html: "

Deep

", config: newTestConfig(100, 10, 5),
			expectedChunks: 1,
		},
		{
			name: "HTML Entities and Special Characters",
			html: "

This is & some text with <entities> and unicode © characters.

", config: newTestConfig(100, 10, 5),
			expectedChunks: 1,
		},
		{
			name: "Excessive Whitespace",
			html: "

\n\t leading and trailing spaces \n\n

", config: newTestConfig(100, 10, 5),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				// Check that excessive whitespace is handled
				// NOTE(review): as written the needle is a single space, which
				// rejects any space at all; presumably it was a run of spaces
				// before this copy was collapsed — confirm.
				assert.NotContains(t, chunks[0].Text, " ", "Whitespace should be normalized")
			},
		},
		{
			name: "Giant Token Test - Single Massive Block",
			html: "

" + strings.Repeat("word ", 1000) + "

", // 1000 tokens in one block
			config:         newTestConfig(50, 20, 10),
			expectedChunks: 1, // Should truncate oversized content to fit max tokens
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			chunks, err := ChunkHTMLContent("Edge Case", tc.html, tc.config)
			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
				return
			}

			require.NoError(t, err)
			assert.Len(t, chunks, tc.expectedChunks)
			if tc.validate != nil {
				tc.validate(t, chunks)
			}
		})
	}
}

// TestChunkingLogic exercises the splitting behaviour of ChunkHTMLContent
// through a series of subtests.
func TestChunkingLogic(t *testing.T) {
	t.Run("Large Content Exceeding MaxTokens", func(t *testing.T) {
		html := generateHTML("h3", "word1 word2 word3 word4 word5", 20) // 100 words/tokens
		config := newTestConfig(50, 20, 10)

		chunks, err := ChunkHTMLContent("Large Content", html, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) > 1, "Should be split into multiple chunks")

		// Oversized chunks are only logged here, not failed.
		for _, chunk := range chunks {
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			if tokens > config.MaxTokens {
				t.Logf("Warning: Chunk with %d tokens exceeds MaxTokens of %d", tokens, config.MaxTokens)
			}
		}
	})

	t.Run("PreserveBlocks Functionality - No Split Zone", func(t *testing.T) {
		html := `

This is some text before.

This is a code block that should not be split. It contains many words to exceed the token limit if it were normal text. one two three four five six seven eight nine ten eleven twelve thirteen.

This is some text after.

` config := newTestConfig(20, 10, 5) chunks, err := ChunkHTMLContent("Preserve", html, config) require.NoError(t, err) assert.True(t, len(chunks) >= 1) // With simple truncation, oversized content gets truncated to fit max tokens // We should still have some content that was originally code hasCodeContent := false for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "code block") { hasCodeContent = true // Verify it respects token limits tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) assert.LessOrEqual(t, tokens, 20, "Truncated content should respect max tokens") break } } assert.True(t, hasCodeContent, "Should have truncated code content") }) t.Run("Priority-based chunking (headings)", func(t *testing.T) { html := `

This is paragraph one. It has enough text to be a chunk with many words here.

This is a Heading

This is paragraph two, which should start in a new chunk.

` config := newTestConfig(30, 5, 3) chunks, err := ChunkHTMLContent("Headings", html, config) require.NoError(t, err) assert.True(t, len(chunks) >= 1) // Check if we have heading chunks hasHeadingChunk := false for _, chunk := range chunks { if chunk.HasHeading { hasHeadingChunk = true break } } assert.True(t, hasHeadingChunk, "Should have at least one chunk with heading") }) t.Run("Boundary merging for small elements - Micro Token Test", func(t *testing.T) { html := `

Small one.

Small two.

Small three.

` config := newTestConfig(50, 5, 3) chunks, err := ChunkHTMLContent("Merging", html, config) require.NoError(t, err) assert.True(t, len(chunks) >= 1, "Should create at least one chunk") }) t.Run("Priority Conflict Test", func(t *testing.T) { html := `

Important Heading Inside Low Priority Container

` config := newTestConfig(50, 10, 5) chunks, err := ChunkHTMLContent("Priority", html, config) require.NoError(t, err) assert.Len(t, chunks, 1) assert.True(t, chunks[0].HasHeading) }) } func TestOverlapFunctionality(t *testing.T) { html := `

This is the first sentence. It provides context for the next part.

This is the second sentence. It should be part of the overlap.

This is the third sentence. This marks the beginning of the second chunk.

This is the fourth sentence. More content for the second chunk here.

` t.Run("Overlap Extraction", func(t *testing.T) { config := newTestConfig(20, 10, 8) // Max 20, Overlap 8 chunks, err := ChunkHTMLContent("Overlap", html, config) require.NoError(t, err) require.True(t, len(chunks) >= 2) if len(chunks) >= 2 { chunk1Text := HTML2Text(chunks[0].OriginalHTML) chunk2Text := HTML2Text(chunks[1].OriginalHTML) assert.Contains(t, chunk1Text, "first sentence") // Check for some form of overlap (implementation may vary) t.Logf("Chunk 1: %s", chunk1Text) t.Logf("Chunk 2: %s", chunk2Text) } }) t.Run("Zero Overlap Configuration", func(t *testing.T) { config := newTestConfig(20, 10, 0) // No overlap chunks, err := ChunkHTMLContent("No Overlap", html, config) require.NoError(t, err) require.True(t, len(chunks) >= 1) t.Logf("Zero overlap test resulted in %d chunks", len(chunks)) }) t.Run("Sentence Boundary Test", func(t *testing.T) { htmlWithVariousEndings := `

Question sentence? Another with exclamation! Normal sentence.

Sentence with ellipsis... And another normal one.

` config := newTestConfig(15, 8, 5) chunks, err := ChunkHTMLContent("Sentences", htmlWithVariousEndings, config) require.NoError(t, err) assert.True(t, len(chunks) >= 1) }) } func TestMetadataAndOutput(t *testing.T) { html := `

Main Title

Some paragraph text.

var x = 1;
data
` config := newTestConfig(100, 10, 5) chunks, err := ChunkHTMLContent("Metadata Test", html, config) require.NoError(t, err) require.Len(t, chunks, 1) chunk := chunks[0] t.Run("Struct Fields", func(t *testing.T) { assert.True(t, chunk.HasHeading, "Should detect heading") assert.True(t, chunk.HasCode, "Should detect code") assert.True(t, chunk.HasTable, "Should detect table") assert.Equal(t, 0, chunk.ChunkIndex) assert.Equal(t, 1, chunk.TotalChunks) }) t.Run("Token Counting Accuracy", func(t *testing.T) { text := HTML2Text(chunk.OriginalHTML) expectedTokens := simpleTokenizer(text) metadataTokens, ok := chunk.Metadata["tokens"].(int) require.True(t, ok, "Metadata should contain tokens count") // Allow some variance since internal processing may differ assert.InDelta(t, expectedTokens, metadataTokens, 10, "Token count in metadata should be close to actual") }) t.Run("Format for Embedding", func(t *testing.T) { assert.Contains(t, chunk.Text, "Title: Metadata Test") assert.Contains(t, chunk.Text, "Content:") }) } func TestDefaultTokenizer(t *testing.T) { testCases := []struct { name string text string expected int // Expected tokens from defaultTokenizer (rune count * 0.4) }{ { name: "Simple English text", text: "Hello world", expected: 4, // 11 runes * 0.4 ≈ 4 tokens }, { name: "Longer text", text: "This is a longer sentence with multiple words", expected: 18, // 45 runes * 0.4 = 18 tokens }, { name: "Unicode characters", text: "Hello 世界", expected: 3, // 8 runes * 0.4 ≈ 3 tokens }, { name: "Empty text", text: "", expected: 0, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { tokens := defaultTokenizer(tc.text) assert.Equal(t, tc.expected, tokens) }) } } func TestGetPriority(t *testing.T) { testCases := []struct { tag string expected int }{ {"h1", 1}, {"h2", 1}, {"h3", 2}, {"pre", 2}, {"code", 2}, {"p", 3}, {"table", 3}, {"div", 4}, {"span", 5}, {"unknown", 5}, } for _, tc := range testCases { t.Run(tc.tag, func(t *testing.T) { priority := 
getPriority(tc.tag) assert.Equal(t, tc.expected, priority) }) } } // getRealWorldHTMLSamples returns realistic HTML samples for comprehensive testing func getRealWorldHTMLSamples() map[string]string { return map[string]string{ "api_reference": `

User API Reference

Authentication

All API requests require authentication using an API key in the header:

Authorization: Bearer your_api_key_here

Create User

Creates a new user account in the system.

Request

POST /api/v1/users

Parameters

ParameterTypeRequiredDescription
emailstringYesUser's email address
first_namestringYesUser's first name
last_namestringNoUser's last name
rolestringNoUser role: admin, agent, or user

Example Request

{
  "email": "john.doe@example.com",
  "first_name": "John",
  "last_name": "Doe",
  "role": "agent"
}

Response

Returns the created user object:

{
  "id": 123,
  "email": "john.doe@example.com",
  "first_name": "John",
  "last_name": "Doe",
  "role": "agent",
  "created_at": "2025-01-15T10:30:00Z"
}
`, "troubleshooting_guide": `

Email Not Working? Troubleshooting Guide

Follow these steps to diagnose and fix email delivery issues:

Step 1: Check Your Email Settings

Step 2: Test Email Delivery

If Step 1 doesn't resolve the issue:

If test email works:

If test email doesn't work:

Step 3: Advanced Troubleshooting

Still having issues? Try these advanced steps:

`, "wysiwyg_nightmare": `

Account Setup Guide

 

Welcome to our platform! Getting started is easy.

 

Important: Please read all instructions carefully.

 

Step 1: Create Your Account

  1. Navigate to the registration page
  2. Fill in your email address
  3. Choose a strong password (minimum 8 characters)

 

Step 2: Verify Your Email

Check your inbox for a verification email. Note: It may take up to 5 minutes to arrive.

If you don't see it, check your spam folder.

 

"Make sure to complete verification within 24 hours, or you'll need to register again."
`, "legal_wall_text": `

Terms of Service

These Terms of Service ("Terms") govern your use of our website and services. By accessing or using our services, you agree to be bound by these Terms. If you disagree with any part of these terms, then you may not access the service. This Terms of Service agreement for our service has been created with the help of legal counsel and covers all the important aspects of using our platform. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Continued use of the service after any such changes shall constitute your consent to such changes. You can review the most current version of the Terms of Service at any time by visiting this page. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Violation of any of the terms below will result in the termination of your account and your access to the service. While we prohibit such conduct and content on the service, you understand and agree that we cannot be responsible for the content posted on the service and you nonetheless may be exposed to such materials. You agree to use the service at your own risk.

You must be 13 years or older to use this service. You must be human and you must provide us with accurate information when you register for an account. Your login may only be used by one person and a single login shared by multiple people is not permitted. You are responsible for maintaining the security of your account and password. The company cannot and will not be liable for any loss or damage from your failure to comply with this security obligation. You are responsible for all content posted and all actions taken with your account. We reserve the right to refuse service to anyone for any reason at any time. We reserve the right to force forfeiture of any username that becomes inactive, violates trademark, or may mislead other users.

`, "feature_comparison_table": `

Pricing Plans Comparison

Choose the plan that best fits your business needs:

Feature Starter
$29/month
Professional
$79/month
Enterprise
$199/month
Support Agents Up to 3 Up to 10 Unlimited
Monthly Conversations 500 5,000 Unlimited
Email Support
Live Chat Widget
Knowledge Base Basic Advanced Advanced + AI
Custom Branding ×
Advanced Analytics ×
API Access × Basic Full Access
Priority Support × ×

Additional Features

`, "poorly_structured_html": `

Getting Started

Introduction

This is not really a quote but we're using blockquote for styling

Prerequisites

This should be a heading but it's a div

Some normal paragraph text here.

Another Main Section

Wait, this should be an h2
Fake heading in a span
This content is in divs instead of paragraphs for some reason.
Another line in a div.

Finally a proper h2

Skipping h3 entirely

Table cell used as heading
Regular table content here
`, "minimalist_haiku": `

Quick Start

Install

npm install

Configure

Edit config.json

Run

npm start

Test

npm test

Deploy

Push to production

Database

PostgreSQL

Cache

Redis

Storage

S3

Monitoring

Datadog

Logging

Sentry

`, "release_notes": `

Release Notes

Version 2.1.0 - January 15, 2025

New Features

Bug Fixes

Version 2.0.3 - December 20, 2024

New Features

Bug Fixes

Version 2.0.2 - November 30, 2024

New Features

Bug Fixes

`, "image_heavy_guide": `

Setting Up Your Live Chat Widget

Follow these visual steps to add the chat widget to your website:

Step 1: Access Widget Settings

Screenshot of admin dashboard with widget settings highlighted

Navigate to Admin > Channels > Live Chat and click on your chat channel.

Step 2: Copy the Widget Code

Screenshot showing the widget code section with copy button

In the Widget Code section, click the "Copy Code" button to copy the JavaScript snippet.

Step 3: Add Code to Your Website

Screenshot of website HTML with widget code pasted before closing body tag

Paste the code just before the closing </body> tag in your website's HTML.

Step 4: Test the Widget

Screenshot of website with chat widget visible in bottom right corner

Visit your website and verify the chat widget appears in the bottom right corner.

Step 5: Customize Appearance

Screenshot of widget customization options showing color and position settings

Back in the admin panel, you can customize the widget's color, position, and welcome message.

`, "nested_lists": `

10 Ways to Improve Customer Support

1. Response Time Optimization

2. Knowledge Management

3. Team Training

`, "faq_description_lists": `

Frequently Asked Questions

Find answers to common questions about our platform:

Account & Billing

How do I change my subscription plan?
You can upgrade or downgrade your plan at any time from your account settings. Navigate to Billing > Subscription and select your new plan. Changes take effect immediately for upgrades, or at the next billing cycle for downgrades.
Can I get a refund if I'm not satisfied?
Yes, we offer a 30-day money-back guarantee for all new subscriptions. Contact our support team within 30 days of your initial purchase for a full refund.
Do you offer annual billing discounts?
Absolutely! Annual subscriptions receive a 20% discount compared to monthly billing. You can switch to annual billing from your account settings at any time.

Technical Support

What browsers do you support?
Our platform works best with modern browsers including Chrome 90+, Firefox 88+, Safari 14+, and Edge 90+. We recommend keeping your browser updated for the best experience.
Is my data secure?
Yes, we take security seriously. All data is encrypted in transit and at rest using industry-standard encryption. We're SOC 2 compliant and undergo regular security audits.
Can I integrate with my existing tools?
We offer integrations with 100+ popular tools including Slack, Salesforce, HubSpot, Zapier, and more. Check our integrations page for a complete list, or use our REST API for custom integrations.

Getting Started

How long does setup take?
Most customers are up and running within 15 minutes. Our setup wizard guides you through the essential configuration steps, and you can always customize further later.
Do you provide onboarding assistance?
Yes! All paid plans include free onboarding support. We'll help you configure your account, import your data, and train your team. Enterprise customers get dedicated onboarding specialists.
`, "kitchen_sink": `

Complete Getting Started Guide

Welcome to the most comprehensive guide for setting up your customer support platform. This guide covers everything you need to know.

Pro Tip: Bookmark this page for easy reference during setup!

Table of Contents

  1. Account Setup
  2. Team Management
  3. Channel Configuration
  4. Advanced Features

1. Account Setup

First things first - let's get your account properly configured:

Basic Information

SettingRecommended ValueNotes
Session timeout30 minutesBalances security and usability
Auto-save interval30 secondsPrevents data loss
LanguageAuto-detectBased on user browser

Configuration Example

{
  "company": {
    "name": "Acme Corp",
    "timezone": "America/New_York",
    "business_hours": {
      "start": "09:00",
      "end": "17:00",
      "days": ["monday", "tuesday", "wednesday", "thursday", "friday"]
    }
  }
}

2. Team Management

Add your team members and configure their roles:

Team management interface showing user roles and permissions
The team management interface allows you to control access and permissions

User Roles

Administrator
Full access to all features and settings. Can manage billing and users.
Agent
Can handle conversations, view reports, and manage their own settings.
Viewer
Read-only access to conversations and reports. Cannot respond to customers.

Advanced Configuration

Click to expand advanced options

These settings are for power users who need fine-grained control:

  • Custom CSS for widget styling
  • Webhook configuration for external integrations
  • Advanced routing rules and automation

Need Help?

If you get stuck during setup, we're here to help:

`, "markdown_import": `

API Documentation

This documentation covers the REST API endpoints for our platform.

Authentication

All API requests require authentication using an API key:

curl -H "Authorization: Bearer YOUR_API_KEY" https://api.example.com/v1/users

Rate Limiting

API requests are limited to 1000 requests per hour per API key.

Rate Limit Headers

Error Handling

The API returns standard HTTP status codes:

Error Response Format

{
  "error": {
    "code": "VALIDATION_ERROR",
    "message": "The email field is required.",
    "details": {
      "field": "email",
      "code": "required"
    }
  }
}

Users Endpoint

List Users

GET /v1/users

Returns a paginated list of users.

Parameters

`, "interactive_transcript": `

Customer Onboarding Flow

This interactive guide walks you through our customer onboarding process:

Before You Start

Make sure you have admin access to customize the onboarding flow.

Step 1: Welcome Message

Configure the first message customers see when they sign up:

Welcome to [Company Name]! 
We're excited to have you on board. 
Let's get you set up in just a few minutes.

Important Note

Keep welcome messages short and friendly. Long text can overwhelm new users.

Step 2: Data Collection

Gather essential information from new customers:

Required Fields:

  • Company name
  • Industry
  • Team size
  • Primary use case

Best Practice: Only ask for information you'll actually use. Each additional field reduces completion rates.

Step 3: Feature Introduction

Introduce key features through guided tours:

Tour Stops:

  • Dashboard overview
  • Creating first conversation
  • Setting up team members
  • Configuring notifications

Pro Tip

Allow users to skip tours and return to them later. Not everyone learns the same way!

`, "giant_code_block": `

Complete Configuration File

Below is the complete configuration file for our application. Copy this to your config.toml file:

# LibreDesk Configuration File
# This file contains all configuration options for the application

[app]
name = "LibreDesk"
version = "0.9.0"
environment = "production"
debug = false
log_level = "info"

[server]
host = "0.0.0.0"
port = 8080
read_timeout = "30s"
write_timeout = "30s"
idle_timeout = "120s"
max_header_bytes = 1048576

[database]
driver = "postgres"
host = "localhost"
port = 5432
name = "libredesk"
user = "postgres"
password = "your_password_here"
sslmode = "disable"
max_open_connections = 25
max_idle_connections = 5
connection_max_lifetime = "1h"

[redis]
host = "localhost"
port = 6379
password = ""
database = 0
max_retries = 3
pool_size = 10

[email]
driver = "smtp"
host = "smtp.gmail.com"
port = 587
username = "your_email@gmail.com"
password = "your_app_password"
from_address = "noreply@yourcompany.com"
from_name = "Your Company Support"

[storage]
driver = "local"
local_path = "./uploads"
max_file_size = "10MB"
allowed_extensions = ["jpg", "jpeg", "png", "gif", "pdf", "doc", "docx"]

[jwt]
secret = "your_super_secret_jwt_key_here"
expiry = "24h"
refresh_expiry = "168h"

[webhook]
queue_size = 1000
concurrency = 5
timeout = "10s"
retry_attempts = 3
retry_delay = "1s"

[ai]
provider = "openai"
api_key = "your_openai_api_key"
model = "gpt-4"
max_tokens = 1000
temperature = 0.7
system_prompt = "You are a helpful customer support assistant."

[embedding]
provider = "openai"
model = "text-embedding-ada-002"
dimensions = 1536
batch_size = 100

[search]
engine = "postgresql"
min_score = 0.5
max_results = 10
boost_title = 2.0
boost_content = 1.0

[monitoring]
enabled = true
metrics_endpoint = "/metrics"
health_endpoint = "/health"
profiler_enabled = false

[rate_limiting]
enabled = true
requests_per_minute = 60
burst_size = 100
cleanup_interval = "1m"

[cors]
allowed_origins = ["http://localhost:3000", "https://yourcompany.com"]
allowed_methods = ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
allowed_headers = ["Content-Type", "Authorization", "X-Requested-With"]
exposed_headers = ["X-Total-Count"]
allow_credentials = true
max_age = "12h"

[security]
bcrypt_cost = 12
session_timeout = "30m"
max_login_attempts = 5
lockout_duration = "15m"
require_https = true
csrf_protection = true

[notifications]
email_enabled = true
webhook_enabled = true
slack_enabled = false
discord_enabled = false

[limits]
max_conversations_per_contact = 1000
max_messages_per_conversation = 10000
max_attachments_per_message = 5
max_tags_per_conversation = 10
max_custom_attributes = 50

After updating your configuration file, restart the application to apply the changes:

sudo systemctl restart libredesk
`, } } func TestRealWorldScenarios(t *testing.T) { samples := getRealWorldHTMLSamples() config := newTestConfig(150, 50, 15) // Use smaller limits to trigger chunking with word-based tokenizer testCases := []struct { name string htmlKey string expectedMinChunks int expectedMaxChunks int validationCallback func(*testing.T, []HTMLChunk, string) }{ { name: "API Reference Manual", htmlKey: "api_reference", expectedMinChunks: 1, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should have code blocks and tables properly chunked hasCodeChunk := false hasTableChunk := false for _, chunk := range chunks { if chunk.HasCode { hasCodeChunk = true } if chunk.HasTable { hasTableChunk = true } } assert.True(t, hasCodeChunk, "API reference should have at least one code chunk") assert.True(t, hasTableChunk, "API reference should have at least one table chunk") }, }, { name: "Troubleshooting Guide", htmlKey: "troubleshooting_guide", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk well with nested lists and headings assert.True(t, len(chunks) >= 2, "Troubleshooting guide should split into multiple logical sections") // Check that token distribution is reasonable for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) assert.True(t, tokens > 0, "Chunk %d should have content", i) } }, }, { name: "WYSIWYG Nightmare", htmlKey: "wysiwyg_nightmare", expectedMinChunks: 1, expectedMaxChunks: 4, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle messy HTML gracefully assert.True(t, len(chunks) >= 1, "WYSIWYG content should create at least one chunk") // Verify no chunks are empty after cleaning for i, chunk := range chunks { cleanText := HTML2Text(chunk.OriginalHTML) assert.NotEmpty(t, strings.TrimSpace(cleanText), "Chunk %d should not be empty after HTML cleanup", i) } }, }, 
{ name: "Legal Wall of Text", htmlKey: "legal_wall_text", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle very long paragraphs by splitting appropriately if len(chunks) > 1 { for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) // No chunk should be excessively large (allow some tolerance) assert.True(t, tokens <= config.MaxTokens*2, "Chunk %d should not be excessively large (%d tokens)", i, tokens) } } }, }, { name: "Feature Comparison Table", htmlKey: "feature_comparison_table", expectedMinChunks: 1, expectedMaxChunks: 4, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should preserve table structure hasTable := false for _, chunk := range chunks { if chunk.HasTable { hasTable = true // Table chunk should contain table structure assert.Contains(t, chunk.OriginalHTML, "= 1, "Should handle poorly structured HTML") // Verify chunker doesn't break on weird nesting for i, chunk := range chunks { assert.NotEmpty(t, chunk.OriginalHTML, "Chunk %d should have content", i) } }, }, { name: "Minimalist Haiku", htmlKey: "minimalist_haiku", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should aggregate small sections appropriately totalTokens := 0 for _, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) totalTokens += tokens } // With many small sections, chunker should merge appropriately assert.True(t, totalTokens > 0, "Should have some content") }, }, { name: "Release Notes", htmlKey: "release_notes", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk by version sections hasHeadings := false for _, chunk := range chunks { if chunk.HasHeading { hasHeadings = true } } assert.True(t, hasHeadings, "Release notes should maintain 
heading structure") }, }, { name: "Image Heavy Guide", htmlKey: "image_heavy_guide", expectedMinChunks: 2, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk around steps with images stepCount := 0 for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "

Step") { stepCount++ } } assert.True(t, stepCount >= 1, "Should preserve step-based structure") }, }, { name: "Nested Lists", htmlKey: "nested_lists", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle deeply nested lists without breaking hierarchy for i, chunk := range chunks { // Check that nested content makes sense assert.NotEmpty(t, HTML2Text(chunk.OriginalHTML), "Chunk %d should have meaningful content", i) } }, }, { name: "FAQ Description Lists", htmlKey: "faq_description_lists", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle dl/dt/dd structure appropriately hasDescriptionList := false for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "
") { hasDescriptionList = true } } assert.True(t, hasDescriptionList, "Should preserve description list structure") }, }, { name: "Kitchen Sink", htmlKey: "kitchen_sink", expectedMinChunks: 3, expectedMaxChunks: 10, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle all element types hasHeading := false hasCode := false hasTable := false for _, chunk := range chunks { if chunk.HasHeading { hasHeading = true } if chunk.HasCode { hasCode = true } if chunk.HasTable { hasTable = true } } assert.True(t, hasHeading, "Kitchen sink should have headings") assert.True(t, hasCode, "Kitchen sink should have code") assert.True(t, hasTable, "Kitchen sink should have tables") }, }, { name: "Markdown Import", htmlKey: "markdown_import", expectedMinChunks: 2, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle clean markdown-generated HTML hasCode := false for _, chunk := range chunks { if chunk.HasCode { hasCode = true } } assert.True(t, hasCode, "Markdown import should preserve code blocks") }, }, { name: "Interactive Transcript", htmlKey: "interactive_transcript", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle custom div elements with data attributes hasCustomDivs := false for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "data-type=") { hasCustomDivs = true } } assert.True(t, hasCustomDivs, "Should preserve custom interactive elements") }, }, { name: "Giant Code Block", htmlKey: "giant_code_block", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should truncate oversized code blocks to respect max tokens hasCodeBlocks := false for _, chunk := range chunks { if chunk.HasCode { hasCodeBlocks = true tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) // Each chunk should respect max 
token limit (150 in test config) assert.LessOrEqual(t, tokens, 150, "Code block chunk should not exceed max tokens after truncation") t.Logf("Code block chunk has %d tokens", tokens) } } assert.True(t, hasCodeBlocks, "Should have code blocks") }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { html, exists := samples[tc.htmlKey] require.True(t, exists, "Test HTML sample %s should exist", tc.htmlKey) chunks, err := ChunkHTMLContent(tc.name, html, config) require.NoError(t, err, "Chunking should not fail for %s", tc.name) // Basic validation assert.GreaterOrEqual(t, len(chunks), tc.expectedMinChunks, "Should have at least %d chunks for %s", tc.expectedMinChunks, tc.name) assert.LessOrEqual(t, len(chunks), tc.expectedMaxChunks, "Should have at most %d chunks for %s", tc.expectedMaxChunks, tc.name) // Verify chunk metadata for i, chunk := range chunks { assert.Equal(t, i, chunk.ChunkIndex, "Chunk index should be correct") assert.Equal(t, len(chunks), chunk.TotalChunks, "Total chunks should be correct") assert.NotEmpty(t, chunk.Text, "Chunk text should not be empty") assert.Contains(t, chunk.Text, tc.name, "Chunk should contain title") } // Token distribution validation totalTokens := 0 for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) totalTokens += tokens // Log token distribution for analysis t.Logf("Chunk %d: %d tokens", i, tokens) } // Scenario-specific validation if tc.validationCallback != nil { tc.validationCallback(t, chunks, tc.name) } t.Logf("Scenario '%s': %d chunks, %d total tokens", tc.name, len(chunks), totalTokens) }) } }

// TestChunkHTMLContent_ConfigurableTokenLimits tests that custom token limits work correctly.
//
// The same fixture is chunked twice: once without passing a config and once
// with a ChunkConfig whose limits are much larger. The test then checks that
// the larger limits never yield more chunks than the baseline, and that every
// chunk produced with the larger limits stores a positive int under the
// "tokens" metadata key.
func TestChunkHTMLContent_ConfigurableTokenLimits(t *testing.T) {
	// Limit the original comments refer to as the old default; it is only used
	// to decide whether a chunk is worth an informational log line.
	const oldDefaultMaxTokens = 700

	// API-guide fixture: endpoint listing, authentication notes and a curl example.
	largeHTML := `

API Documentation

This is a comprehensive guide to our API endpoints with detailed examples.

EndpointMethodDescriptionParametersResponse
/api/usersGETGet all userspage, limitJSON array of users
/api/users/{id}GETGet user by IDid (path)JSON user object
/api/usersPOSTCreate new username, email, roleCreated user object
/api/users/{id}PUTUpdate userid (path), name, email, roleUpdated user object
/api/users/{id}DELETEDelete userid (path)Success message

Authentication

All API endpoints require authentication using JWT tokens in the Authorization header.

curl -H "Authorization: Bearer YOUR_TOKEN" -X GET https://api.example.com/users

Additional content to make this chunk larger and test the token limits effectively.

`

	// Test with default config (smaller chunks): no config argument is passed.
	defaultChunks, err := ChunkHTMLContent("API Guide", largeHTML)
	require.NoError(t, err)

	// Test with larger token config (should create fewer, larger chunks)
	largeConfig := ChunkConfig{
		MaxTokens:      2000, // Much larger than default 700
		MinTokens:      400,  // Larger than default 200
		OverlapTokens:  150,  // Larger than default 75
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
	largeChunks, err := ChunkHTMLContent("API Guide", largeHTML, largeConfig)
	require.NoError(t, err)

	// Verify that larger config creates fewer chunks. LessOrEqual reports both
	// operands on failure, unlike asserting on a pre-evaluated boolean.
	assert.LessOrEqual(t, len(largeChunks), len(defaultChunks),
		"Larger token config should create fewer or equal chunks. Default: %d, Large: %d",
		len(defaultChunks), len(largeChunks))

	// Verify that chunks contain expected metadata
	for _, chunk := range largeChunks {
		tokens, ok := chunk.Metadata["tokens"].(int)
		if !assert.True(t, ok, "Chunk should have token count in metadata") {
			// The failure is already recorded; checking the zero value would
			// only add a second, misleading failure for the same chunk.
			continue
		}
		assert.Greater(t, tokens, 0, "Token count should be positive")

		// Check that we're respecting the larger config (informational only)
		if tokens > oldDefaultMaxTokens {
			t.Logf("✅ Large chunk with %d tokens (exceeds old 700 limit)", tokens)
		}
	}

	t.Logf("Default config: %d chunks, Large config: %d chunks", len(defaultChunks), len(largeChunks))
}