", config: newTestConfig(100, 10, 5), expectedChunks: 1, // Should still parse leniently }, { name: "Deeply Nested HTML", html: "
Deep
", config: newTestConfig(100, 10, 5), expectedChunks: 1, }, { name: "HTML Entities and Special Characters", html: "
This is & some text with <entities> and unicode © characters.
", config: newTestConfig(100, 10, 5), expectedChunks: 1, }, { name: "Excessive Whitespace", html: "
\n\t leading and trailing spaces \n\n
", config: newTestConfig(100, 10, 5), expectedChunks: 1, validate: func(t *testing.T, chunks []HTMLChunk) { // Check that excessive whitespace is handled assert.NotContains(t, chunks[0].Text, " ", "Whitespace should be normalized") }, }, { name: "Giant Token Test - Single Massive Block", html: "
" + strings.Repeat("word ", 1000) + "
", // 1000 tokens in one block config: newTestConfig(50, 20, 10), expectedChunks: 1, // Should truncate oversized content to fit max tokens }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { chunks, err := ChunkHTMLContent("Edge Case", tc.html, tc.config) if tc.expectedError != "" { require.Error(t, err) assert.Contains(t, err.Error(), tc.expectedError) } else { require.NoError(t, err) assert.Len(t, chunks, tc.expectedChunks) if tc.validate != nil { tc.validate(t, chunks) } } }) } }

// TestChunkingLogic runs ChunkHTMLContent against several fixtures and token
// budgets: content larger than the budget, an oversized code block, headings,
// very small elements, and a heading nested in a container.
//
// NOTE(review): the HTML fixtures in this function contain no markup in this
// copy of the file, while some assertions depend on heading detection —
// presumably the tags were stripped during extraction; confirm against the
// repository version before relying on these fixtures.
func TestChunkingLogic(t *testing.T) {
	t.Run("Large Content Exceeding MaxTokens", func(t *testing.T) {
		html := generateHTML("h3", "word1 word2 word3 word4 word5", 20) // 100 words/tokens
		config := newTestConfig(50, 20, 10)

		chunks, err := ChunkHTMLContent("Large Content", html, config)
		require.NoError(t, err)
		assert.Greater(t, len(chunks), 1, "Should be split into multiple chunks")

		// A chunk above the budget is only logged here, not treated as a failure.
		for _, chunk := range chunks {
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			if tokens > config.MaxTokens {
				t.Logf("Warning: Chunk with %d tokens exceeds MaxTokens of %d", tokens, config.MaxTokens)
			}
		}
	})

	t.Run("PreserveBlocks Functionality - No Split Zone", func(t *testing.T) {
		html := `
This is some text before.

This is a code block that should not be split. It contains many words to exceed the token limit if it were normal text. one two three four five six seven eight nine ten eleven twelve thirteen.

This is some text after.
`
		config := newTestConfig(20, 10, 5)

		chunks, err := ChunkHTMLContent("Preserve", html, config)
		require.NoError(t, err)
		assert.NotEmpty(t, chunks)

		// With simple truncation, oversized content gets truncated to fit max tokens.
		// We should still have some content that was originally code.
		hasCodeContent := false
		for _, chunk := range chunks {
			if strings.Contains(chunk.OriginalHTML, "code block") {
				hasCodeContent = true
				// Verify it respects token limits. The bound is read from the
				// config so it cannot drift from the budget chosen above.
				tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
				assert.LessOrEqual(t, tokens, config.MaxTokens, "Truncated content should respect max tokens")
				break
			}
		}
		assert.True(t, hasCodeContent, "Should have truncated code content")
	})

	t.Run("Priority-based chunking (headings)", func(t *testing.T) {
		html := `
This is paragraph one. It has enough text to be a chunk with many words here.

This is a Heading

This is paragraph two, which should start in a new chunk.
`
		config := newTestConfig(30, 5, 3)

		chunks, err := ChunkHTMLContent("Headings", html, config)
		require.NoError(t, err)
		assert.NotEmpty(t, chunks)

		// Check if we have heading chunks.
		hasHeadingChunk := false
		for _, chunk := range chunks {
			if chunk.HasHeading {
				hasHeadingChunk = true
				break
			}
		}
		assert.True(t, hasHeadingChunk, "Should have at least one chunk with heading")
	})

	t.Run("Boundary merging for small elements - Micro Token Test", func(t *testing.T) {
		html := `
Small one.

Small two.

Small three.
`
		config := newTestConfig(50, 5, 3)

		chunks, err := ChunkHTMLContent("Merging", html, config)
		require.NoError(t, err)
		assert.NotEmpty(t, chunks, "Should create at least one chunk")
	})

	t.Run("Priority Conflict Test", func(t *testing.T) {
		html := `
Important Heading Inside Low Priority Container
`
		config := newTestConfig(50, 10, 5)

		chunks, err := ChunkHTMLContent("Priority", html, config)
		require.NoError(t, err)
		// require (not assert): chunks[0] is indexed next, so a wrong length
		// must stop the subtest instead of panicking on the index.
		require.Len(t, chunks, 1)
		assert.True(t, chunks[0].HasHeading)
	})
}

// TestOverlapFunctionality chunks one four-paragraph fixture with a non-zero
// overlap, with zero overlap, and chunks a second fixture mixing sentence
// terminators (?, !, ., ...). Only chunk counts and the presence of the first
// sentence in the first chunk are asserted; the chunk texts are logged.
func TestOverlapFunctionality(t *testing.T) {
	html := `
This is the first sentence. It provides context for the next part.

This is the second sentence. It should be part of the overlap.

This is the third sentence. This marks the beginning of the second chunk.

This is the fourth sentence. More content for the second chunk here.
`

	t.Run("Overlap Extraction", func(t *testing.T) {
		config := newTestConfig(20, 10, 8) // Max 20, Overlap 8

		chunks, err := ChunkHTMLContent("Overlap", html, config)
		require.NoError(t, err)
		// Fails fast, so the two index expressions below are always in range.
		require.GreaterOrEqual(t, len(chunks), 2)

		chunk1Text := HTML2Text(chunks[0].OriginalHTML)
		chunk2Text := HTML2Text(chunks[1].OriginalHTML)
		assert.Contains(t, chunk1Text, "first sentence")

		// Check for some form of overlap (implementation may vary).
		t.Logf("Chunk 1: %s", chunk1Text)
		t.Logf("Chunk 2: %s", chunk2Text)
	})

	t.Run("Zero Overlap Configuration", func(t *testing.T) {
		config := newTestConfig(20, 10, 0) // No overlap

		chunks, err := ChunkHTMLContent("No Overlap", html, config)
		require.NoError(t, err)
		require.NotEmpty(t, chunks)
		t.Logf("Zero overlap test resulted in %d chunks", len(chunks))
	})

	t.Run("Sentence Boundary Test", func(t *testing.T) {
		htmlWithVariousEndings := `
Question sentence? Another with exclamation! Normal sentence.

Sentence with ellipsis... And another normal one.
`
		config := newTestConfig(15, 8, 5)

		chunks, err := ChunkHTMLContent("Sentences", htmlWithVariousEndings, config)
		require.NoError(t, err)
		assert.NotEmpty(t, chunks)
	})
}

// TestMetadataAndOutput chunks a small fixture that is expected to fit in a
// single chunk and then checks that chunk's flags, index fields, the "tokens"
// metadata entry, and the "Title:" / "Content:" markers in its Text.
//
// NOTE(review): the fixture below has no heading, code or table markup in this
// copy although the flags for all three are asserted — presumably stripped
// during extraction; confirm against the repository version.
func TestMetadataAndOutput(t *testing.T) {
	html := `
Main Title

Some paragraph text.

var x = 1;

data
`
	config := newTestConfig(100, 10, 5)

	chunks, err := ChunkHTMLContent("Metadata Test", html, config)
	require.NoError(t, err)
	require.Len(t, chunks, 1)
	chunk := chunks[0]

	t.Run("Struct Fields", func(t *testing.T) {
		assert.True(t, chunk.HasHeading, "Should detect heading")
		assert.True(t, chunk.HasCode, "Should detect code")
		assert.True(t, chunk.HasTable, "Should detect table")
		assert.Equal(t, 0, chunk.ChunkIndex)
		assert.Equal(t, 1, chunk.TotalChunks)
	})

	t.Run("Token Counting Accuracy", func(t *testing.T) {
		text := HTML2Text(chunk.OriginalHTML)
		expectedTokens := simpleTokenizer(text)

		metadataTokens, ok := chunk.Metadata["tokens"].(int)
		require.True(t, ok, "Metadata should contain tokens count")
		// Allow some variance since internal processing may differ.
		assert.InDelta(t, expectedTokens, metadataTokens, 10, "Token count in metadata should be close to actual")
	})

	t.Run("Format for Embedding", func(t *testing.T) {
		assert.Contains(t, chunk.Text, "Title: Metadata Test")
		assert.Contains(t, chunk.Text, "Content:")
	})
}

// TestDefaultTokenizer checks defaultTokenizer against fixed expectations for
// ASCII, mixed ASCII/CJK and empty input.
func TestDefaultTokenizer(t *testing.T) {
	testCases := []struct {
		name     string
		text     string
		expected int // Expected tokens from defaultTokenizer (rune count * 0.4)
	}{
		{
			name:     "Simple English text",
			text:     "Hello world",
			expected: 4, // 11 runes * 0.4 ≈ 4 tokens
		},
		{
			name:     "Longer text",
			text:     "This is a longer sentence with multiple words",
			expected: 18, // 45 runes * 0.4 = 18 tokens
		},
		{
			name:     "Unicode characters",
			text:     "Hello 世界",
			expected: 3, // 8 runes * 0.4 ≈ 3 tokens
		},
		{
			name:     "Empty text",
			text:     "",
			expected: 0,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			tokens := defaultTokenizer(tc.text)
			assert.Equal(t, tc.expected, tokens)
		})
	}
}

// TestGetPriority pins the priority value getPriority returns for a set of
// tag names, including a tag name that is not an HTML element ("unknown").
func TestGetPriority(t *testing.T) {
	testCases := []struct {
		tag      string
		expected int
	}{
		{"h1", 1},
		{"h2", 1},
		{"h3", 2},
		{"pre", 2},
		{"code", 2},
		{"p", 3},
		{"table", 3},
		{"div", 4},
		{"span", 5},
		{"unknown", 5},
	}

	for _, tc := range testCases {
		t.Run(tc.tag, func(t *testing.T) {
			priority := getPriority(tc.tag)
			assert.Equal(t, tc.expected, priority)
		})
	}
}

// getRealWorldHTMLSamples returns realistic HTML samples for comprehensive testing func getRealWorldHTMLSamples() map[string]string { return map[string]string{ "api_reference": `
User API Reference

Authentication

All API requests require authentication using an API key in the header:

Authorization: Bearer your_api_key_here

Create User

Creates a new user account in the system.

Request

POST /api/v1/users

Parameters

Parameter Type Required Description

email string Yes User's email address

first_name string Yes User's first name

last_name string No User's last name

role string No User role: admin, agent, or user

Example Request

{ "email": "john.doe@example.com", "first_name": "John", "last_name": "Doe", "role": "agent" }

Response

Returns the created user object:

{ "id": 123, "email": "john.doe@example.com", "first_name": "John", "last_name": "Doe", "role": "agent", "created_at": "2025-01-15T10:30:00Z" }
`, "troubleshooting_guide": `
Email Not Working? Troubleshooting Guide

Follow these steps to diagnose and fix email delivery issues:

Step 1: Check Your Email Settings

Verify SMTP server settings are correct

Check if port 587 or 465 is properly configured

Ensure authentication credentials are valid

Step 2: Test Email Delivery

If Step 1 doesn't resolve the issue:

Send a test email to yourself

Check spam/junk folders

Try sending to a different email provider (Gmail, Yahoo, etc.)

If test email works:

The issue is likely with the recipient's email

Ask them to check their spam folder

Verify the recipient email address is correct

If test email doesn't work:

Check error logs in Admin > System > Logs

Look for SMTP authentication errors

Contact your email provider about potential blocks

Step 3: Advanced Troubleshooting

Still having issues? Try these advanced steps:

Enable debug logging for email delivery

Check DNS records (SPF, DKIM, DMARC)

Test with a different SMTP provider

Contact support with error logs

`, "wysiwyg_nightmare": `
Account Setup Guide

Welcome to our platform! Getting started is easy.

Important: Please read all instructions carefully.

Step 1: Create Your Account

Navigate to the registration page

Fill in your email address

Choose a strong password (minimum 8 characters)

Step 2: Verify Your Email

Check your inbox for a verification email. Note: It may take up to 5 minutes to arrive.

If you don't see it, check your spam folder.

"Make sure to complete verification within 24 hours, or you'll need to register again."
`, "legal_wall_text": `
Terms of Service

These Terms of Service ("Terms") govern your use of our website and services. By accessing or using our services, you agree to be bound by these Terms. If you disagree with any part of these terms, then you may not access the service. This Terms of Service agreement for our service has been created with the help of legal counsel and covers all the important aspects of using our platform. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Continued use of the service after any such changes shall constitute your consent to such changes. You can review the most current version of the Terms of Service at any time by visiting this page. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Violation of any of the terms below will result in the termination of your account and your access to the service. While we prohibit such conduct and content on the service, you understand and agree that we cannot be responsible for the content posted on the service and you nonetheless may be exposed to such materials. You agree to use the service at your own risk.

You must be 13 years or older to use this service. You must be human and you must provide us with accurate information when you register for an account. Your login may only be used by one person and a single login shared by multiple people is not permitted. You are responsible for maintaining the security of your account and password. The company cannot and will not be liable for any loss or damage from your failure to comply with this security obligation. You are responsible for all content posted and all actions taken with your account. We reserve the right to refuse service to anyone for any reason at any time. We reserve the right to force forfeiture of any username that becomes inactive, violates trademark, or may mislead other users.
`, "feature_comparison_table": `
Pricing Plans Comparison

Choose the plan that best fits your business needs:

Feature Starter
$29/month Professional
$79/month Enterprise
$199/month

Support Agents Up to 3 Up to 10 Unlimited

Monthly Conversations 500 5,000 Unlimited

Email Support ✓ ✓ ✓

Live Chat Widget ✓ ✓ ✓

Knowledge Base Basic Advanced Advanced + AI

Custom Branding × ✓ ✓

Advanced Analytics × ✓ ✓

API Access × Basic Full Access

Priority Support × × ✓

Additional Features

All plans include: 24/7 uptime monitoring, SSL encryption, regular backups

Professional and Enterprise: Custom integrations, advanced workflows

Enterprise only: Dedicated account manager, custom SLA, on-premise deployment options

`, "poorly_structured_html": `
Getting Started

Introduction

This is not really a quote but we're using blockquote for styling

Prerequisites

This should be a heading but it's a div

Some normal paragraph text here.

Another Main Section

Wait, this should be an h2

Fake heading in a span
This content is in divs instead of paragraphs for some reason.

Another line in a div.

Finally a proper h2

Skipping h3 entirely

Table cell used as heading

Regular table content here

`, "minimalist_haiku": `
Quick Start

Install

npm install

Configure

Edit config.json

Run

npm start

Test

npm test

Deploy

Push to production

Database

PostgreSQL

Cache

Redis

Storage

S3

Monitoring

Datadog

Logging

Sentry
`, "release_notes": `
Release Notes

Version 2.1.0 - January 15, 2025

New Features

Added AI-powered response suggestions for agents

Implemented advanced search filters in conversation list

Added support for file attachments in live chat

New dashboard widgets for team performance metrics

Bug Fixes

Fixed email notifications not being sent for certain conversation states

Resolved timezone display issues in reporting

Fixed widget positioning on mobile devices

Version 2.0.3 - December 20, 2024

New Features

Added bulk actions for conversation management

Implemented custom fields for customer profiles

Added integration with Slack for team notifications

Bug Fixes

Fixed memory leak in WebSocket connections

Resolved search indexing issues with special characters

Fixed CSV export formatting problems

Version 2.0.2 - November 30, 2024

New Features

Added support for multiple languages in knowledge base

Implemented automated conversation routing based on keywords

Bug Fixes

Fixed authentication issues with SSO providers

Resolved performance issues with large conversation histories

`, "image_heavy_guide": `
Setting Up Your Live Chat Widget

Follow these visual steps to add the chat widget to your website:

Step 1: Access Widget Settings

Navigate to Admin > Channels > Live Chat and click on your chat channel.

Step 2: Copy the Widget Code

In the Widget Code section, click the "Copy Code" button to copy the JavaScript snippet.

Step 3: Add Code to Your Website

Paste the code just before the closing </body> tag in your website's HTML.

Step 4: Test the Widget

Visit your website and verify the chat widget appears in the bottom right corner.

Step 5: Customize Appearance

Back in the admin panel, you can customize the widget's color, position, and welcome message.
`, "nested_lists": `
10 Ways to Improve Customer Support

1. Response Time Optimization

Set clear response time expectations

Email: Within 4 hours during business hours

Live chat: Within 2 minutes

Phone: Answer within 3 rings

Use automation to acknowledge receipt

Auto-reply emails

Chat welcome messages

Ticket confirmation SMS

2. Knowledge Management

Create comprehensive FAQ sections

Common technical issues

Login problems

Password reset

Browser compatibility

Billing and account questions

Payment methods

Subscription changes

Refund policies

Maintain up-to-date documentation

Review quarterly

Update with new features

Remove outdated information

3. Team Training

Product knowledge training

Monthly product updates

Hands-on feature testing

Cross-departmental sessions

Communication skills development

Active listening techniques

Empathy building exercises

Conflict resolution strategies

`, "faq_description_lists": `
Frequently Asked Questions

Find answers to common questions about our platform:

Account & Billing

How do I change my subscription plan?

You can upgrade or downgrade your plan at any time from your account settings. Navigate to Billing > Subscription and select your new plan. Changes take effect immediately for upgrades, or at the next billing cycle for downgrades.

Can I get a refund if I'm not satisfied?

Yes, we offer a 30-day money-back guarantee for all new subscriptions. Contact our support team within 30 days of your initial purchase for a full refund.

Do you offer annual billing discounts?

Absolutely! Annual subscriptions receive a 20% discount compared to monthly billing. You can switch to annual billing from your account settings at any time.

Technical Support

What browsers do you support?

Our platform works best with modern browsers including Chrome 90+, Firefox 88+, Safari 14+, and Edge 90+. We recommend keeping your browser updated for the best experience.

Is my data secure?

Yes, we take security seriously. All data is encrypted in transit and at rest using industry-standard encryption. We're SOC 2 compliant and undergo regular security audits.

Can I integrate with my existing tools?

We offer integrations with 100+ popular tools including Slack, Salesforce, HubSpot, Zapier, and more. Check our integrations page for a complete list, or use our REST API for custom integrations.

Getting Started

How long does setup take?

Most customers are up and running within 15 minutes. Our setup wizard guides you through the essential configuration steps, and you can always customize further later.

Do you provide onboarding assistance?

Yes! All paid plans include free onboarding support. We'll help you configure your account, import your data, and train your team. Enterprise customers get dedicated onboarding specialists.

`, "kitchen_sink": `
Complete Getting Started Guide

Welcome to the most comprehensive guide for setting up your customer support platform. This guide covers everything you need to know.

Pro Tip: Bookmark this page for easy reference during setup!

Table of Contents

Account Setup

Team Management

Channel Configuration

Advanced Features

1. Account Setup

First things first - let's get your account properly configured:

Basic Information

Company name and details

Time zone configuration

Business hours setup

Setting Recommended Value Notes

Session timeout 30 minutes Balances security and usability

Auto-save interval 30 seconds Prevents data loss

Language Auto-detect Based on user browser

Configuration Example

{ "company": { "name": "Acme Corp", "timezone": "America/New_York", "business_hours": { "start": "09:00", "end": "17:00", "days": ["monday", "tuesday", "wednesday", "thursday", "friday"] } } }

2. Team Management

Add your team members and configure their roles:

The team management interface allows you to control access and permissions

User Roles

Administrator

Full access to all features and settings. Can manage billing and users.

Agent

Can handle conversations, view reports, and manage their own settings.

Viewer

Read-only access to conversations and reports. Cannot respond to customers.

Advanced Configuration

Click to expand advanced options

These settings are for power users who need fine-grained control:

Custom CSS for widget styling

Webhook configuration for external integrations

Advanced routing rules and automation

Need Help?

If you get stuck during setup, we're here to help:

📧 Email: support@example.com

💬 Live chat: Available 24/7

📱 Phone: +1-555-0123

`, "markdown_import": `
API Documentation

This documentation covers the REST API endpoints for our platform.

Authentication

All API requests require authentication using an API key:

curl -H "Authorization: Bearer YOUR_API_KEY" https://api.example.com/v1/users

Rate Limiting

API requests are limited to 1000 requests per hour per API key.

Rate Limit Headers

X-RateLimit-Limit: The rate limit ceiling for your API key

X-RateLimit-Remaining: The number of requests left for the time window

X-RateLimit-Reset: The UTC date/time when the rate limit resets

Error Handling

The API returns standard HTTP status codes:

200 - Success

400 - Bad Request

401 - Unauthorized

404 - Not Found

500 - Internal Server Error

Error Response Format

{ "error": { "code": "VALIDATION_ERROR", "message": "The email field is required.", "details": { "field": "email", "code": "required" } } }

Users Endpoint

List Users

GET /v1/users

Returns a paginated list of users.

Parameters

page (integer, optional): Page number, defaults to 1

limit (integer, optional): Items per page, defaults to 20, max 100

role (string, optional): Filter by user role

`, "interactive_transcript": `
Customer Onboarding Flow

This interactive guide walks you through our customer onboarding process:

Before You Start

Make sure you have admin access to customize the onboarding flow.

Step 1: Welcome Message

Configure the first message customers see when they sign up:

Welcome to [Company Name]! We're excited to have you on board. Let's get you set up in just a few minutes.

Important Note

Keep welcome messages short and friendly. Long text can overwhelm new users.

Step 2: Data Collection

Gather essential information from new customers:

Required Fields:

Company name

Industry

Team size

Primary use case

Best Practice: Only ask for information you'll actually use. Each additional field reduces completion rates.

Step 3: Feature Introduction

Introduce key features through guided tours:

Tour Stops:

Dashboard overview

Creating first conversation

Setting up team members

Configuring notifications

Pro Tip

Allow users to skip tours and return to them later. Not everyone learns the same way!

`, "giant_code_block": `
Complete Configuration File

Below is the complete configuration file for our application. Copy this to your config.toml file:

# LibreDesk Configuration File # This file contains all configuration options for the application [app] name = "LibreDesk" version = "0.9.0" environment = "production" debug = false log_level = "info" [server] host = "0.0.0.0" port = 8080 read_timeout = "30s" write_timeout = "30s" idle_timeout = "120s" max_header_bytes = 1048576 [database] driver = "postgres" host = "localhost" port = 5432 name = "libredesk" user = "postgres" password = "your_password_here" sslmode = "disable" max_open_connections = 25 max_idle_connections = 5 connection_max_lifetime = "1h" [redis] host = "localhost" port = 6379 password = "" database = 0 max_retries = 3 pool_size = 10 [email] driver = "smtp" host = "smtp.gmail.com" port = 587 username = "your_email@gmail.com" password = "your_app_password" from_address = "noreply@yourcompany.com" from_name = "Your Company Support" [storage] driver = "local" local_path = "./uploads" max_file_size = "10MB" allowed_extensions = ["jpg", "jpeg", "png", "gif", "pdf", "doc", "docx"] [jwt] secret = "your_super_secret_jwt_key_here" expiry = "24h" refresh_expiry = "168h" [webhook] queue_size = 1000 concurrency = 5 timeout = "10s" retry_attempts = 3 retry_delay = "1s" [ai] provider = "openai" api_key = "your_openai_api_key" model = "gpt-4" max_tokens = 1000 temperature = 0.7 system_prompt = "You are a helpful customer support assistant." 
[embedding] provider = "openai" model = "text-embedding-ada-002" dimensions = 1536 batch_size = 100 [search] engine = "postgresql" min_score = 0.5 max_results = 10 boost_title = 2.0 boost_content = 1.0 [monitoring] enabled = true metrics_endpoint = "/metrics" health_endpoint = "/health" profiler_enabled = false [rate_limiting] enabled = true requests_per_minute = 60 burst_size = 100 cleanup_interval = "1m" [cors] allowed_origins = ["http://localhost:3000", "https://yourcompany.com"] allowed_methods = ["GET", "POST", "PUT", "DELETE", "OPTIONS"] allowed_headers = ["Content-Type", "Authorization", "X-Requested-With"] exposed_headers = ["X-Total-Count"] allow_credentials = true max_age = "12h" [security] bcrypt_cost = 12 session_timeout = "30m" max_login_attempts = 5 lockout_duration = "15m" require_https = true csrf_protection = true [notifications] email_enabled = true webhook_enabled = true slack_enabled = false discord_enabled = false [limits] max_conversations_per_contact = 1000 max_messages_per_conversation = 10000 max_attachments_per_message = 5 max_tags_per_conversation = 10 max_custom_attributes = 50

After updating your configuration file, restart the application to apply the changes:

sudo systemctl restart libredesk
`, } } func TestRealWorldScenarios(t *testing.T) { samples := getRealWorldHTMLSamples() config := newTestConfig(150, 50, 15) // Use smaller limits to trigger chunking with word-based tokenizer testCases := []struct { name string htmlKey string expectedMinChunks int expectedMaxChunks int validationCallback func(*testing.T, []HTMLChunk, string) }{ { name: "API Reference Manual", htmlKey: "api_reference", expectedMinChunks: 1, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should have code blocks and tables properly chunked hasCodeChunk := false hasTableChunk := false for _, chunk := range chunks { if chunk.HasCode { hasCodeChunk = true } if chunk.HasTable { hasTableChunk = true } } assert.True(t, hasCodeChunk, "API reference should have at least one code chunk") assert.True(t, hasTableChunk, "API reference should have at least one table chunk") }, }, { name: "Troubleshooting Guide", htmlKey: "troubleshooting_guide", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk well with nested lists and headings assert.True(t, len(chunks) >= 2, "Troubleshooting guide should split into multiple logical sections") // Check that token distribution is reasonable for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) assert.True(t, tokens > 0, "Chunk %d should have content", i) } }, }, { name: "WYSIWYG Nightmare", htmlKey: "wysiwyg_nightmare", expectedMinChunks: 1, expectedMaxChunks: 4, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle messy HTML gracefully assert.True(t, len(chunks) >= 1, "WYSIWYG content should create at least one chunk") // Verify no chunks are empty after cleaning for i, chunk := range chunks { cleanText := HTML2Text(chunk.OriginalHTML) assert.NotEmpty(t, strings.TrimSpace(cleanText), "Chunk %d should not be empty after HTML cleanup", i) } }, }, 
{ name: "Legal Wall of Text", htmlKey: "legal_wall_text", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle very long paragraphs by splitting appropriately if len(chunks) > 1 { for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) // No chunk should be excessively large (allow some tolerance) assert.True(t, tokens <= config.MaxTokens*2, "Chunk %d should not be excessively large (%d tokens)", i, tokens) } } }, }, { name: "Feature Comparison Table", htmlKey: "feature_comparison_table", expectedMinChunks: 1, expectedMaxChunks: 4, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should preserve table structure hasTable := false for _, chunk := range chunks { if chunk.HasTable { hasTable = true // Table chunk should contain table structure assert.Contains(t, chunk.OriginalHTML, "= 1, "Should handle poorly structured HTML") // Verify chunker doesn't break on weird nesting for i, chunk := range chunks { assert.NotEmpty(t, chunk.OriginalHTML, "Chunk %d should have content", i) } }, }, { name: "Minimalist Haiku", htmlKey: "minimalist_haiku", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should aggregate small sections appropriately totalTokens := 0 for _, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) totalTokens += tokens } // With many small sections, chunker should merge appropriately assert.True(t, totalTokens > 0, "Should have some content") }, }, { name: "Release Notes", htmlKey: "release_notes", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk by version sections hasHeadings := false for _, chunk := range chunks { if chunk.HasHeading { hasHeadings = true } } assert.True(t, hasHeadings, "Release notes should maintain 
heading structure") }, }, { name: "Image Heavy Guide", htmlKey: "image_heavy_guide", expectedMinChunks: 2, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should chunk around steps with images stepCount := 0 for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "
Step") { stepCount++ } } assert.True(t, stepCount >= 1, "Should preserve step-based structure") }, }, { name: "Nested Lists", htmlKey: "nested_lists", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle deeply nested lists without breaking hierarchy for i, chunk := range chunks { // Check that nested content makes sense assert.NotEmpty(t, HTML2Text(chunk.OriginalHTML), "Chunk %d should have meaningful content", i) } }, }, { name: "FAQ Description Lists", htmlKey: "faq_description_lists", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle dl/dt/dd structure appropriately hasDescriptionList := false for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "
") { hasDescriptionList = true } } assert.True(t, hasDescriptionList, "Should preserve description list structure") }, }, { name: "Kitchen Sink", htmlKey: "kitchen_sink", expectedMinChunks: 3, expectedMaxChunks: 10, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle all element types hasHeading := false hasCode := false hasTable := false for _, chunk := range chunks { if chunk.HasHeading { hasHeading = true } if chunk.HasCode { hasCode = true } if chunk.HasTable { hasTable = true } } assert.True(t, hasHeading, "Kitchen sink should have headings") assert.True(t, hasCode, "Kitchen sink should have code") assert.True(t, hasTable, "Kitchen sink should have tables") }, }, { name: "Markdown Import", htmlKey: "markdown_import", expectedMinChunks: 2, expectedMaxChunks: 8, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle clean markdown-generated HTML hasCode := false for _, chunk := range chunks { if chunk.HasCode { hasCode = true } } assert.True(t, hasCode, "Markdown import should preserve code blocks") }, }, { name: "Interactive Transcript", htmlKey: "interactive_transcript", expectedMinChunks: 2, expectedMaxChunks: 6, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should handle custom div elements with data attributes hasCustomDivs := false for _, chunk := range chunks { if strings.Contains(chunk.OriginalHTML, "data-type=") { hasCustomDivs = true } } assert.True(t, hasCustomDivs, "Should preserve custom interactive elements") }, }, { name: "Giant Code Block", htmlKey: "giant_code_block", expectedMinChunks: 1, expectedMaxChunks: 3, validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) { // Should truncate oversized code blocks to respect max tokens hasCodeBlocks := false for _, chunk := range chunks { if chunk.HasCode { hasCodeBlocks = true tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) // Each chunk should respect max 
token limit (150 in test config) assert.LessOrEqual(t, tokens, 150, "Code block chunk should not exceed max tokens after truncation") t.Logf("Code block chunk has %d tokens", tokens) } } assert.True(t, hasCodeBlocks, "Should have code blocks") }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { html, exists := samples[tc.htmlKey] require.True(t, exists, "Test HTML sample %s should exist", tc.htmlKey) chunks, err := ChunkHTMLContent(tc.name, html, config) require.NoError(t, err, "Chunking should not fail for %s", tc.name) // Basic validation assert.GreaterOrEqual(t, len(chunks), tc.expectedMinChunks, "Should have at least %d chunks for %s", tc.expectedMinChunks, tc.name) assert.LessOrEqual(t, len(chunks), tc.expectedMaxChunks, "Should have at most %d chunks for %s", tc.expectedMaxChunks, tc.name) // Verify chunk metadata for i, chunk := range chunks { assert.Equal(t, i, chunk.ChunkIndex, "Chunk index should be correct") assert.Equal(t, len(chunks), chunk.TotalChunks, "Total chunks should be correct") assert.NotEmpty(t, chunk.Text, "Chunk text should not be empty") assert.Contains(t, chunk.Text, tc.name, "Chunk should contain title") } // Token distribution validation totalTokens := 0 for i, chunk := range chunks { tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML)) totalTokens += tokens // Log token distribution for analysis t.Logf("Chunk %d: %d tokens", i, tokens) } // Scenario-specific validation if tc.validationCallback != nil { tc.validationCallback(t, chunks, tc.name) } t.Logf("Scenario '%s': %d chunks, %d total tokens", tc.name, len(chunks), totalTokens) }) } }

// TestChunkHTMLContent_ConfigurableTokenLimits tests that custom token limits
// work correctly.
//
// The same document is chunked twice: once with no explicit config and once
// with a ChunkConfig whose limits are larger. The larger config must not yield
// more chunks than the default run, and every chunk it yields must carry a
// positive integer "tokens" entry in its Metadata.
func TestChunkHTMLContent_ConfigurableTokenLimits(t *testing.T) {
	// oldDefaultMaxTokens is the previous default MaxTokens; it is only used
	// to log when a chunk produced by the larger config exceeds it.
	const oldDefaultMaxTokens = 700

	// Large HTML content with table and code
	largeHTML := `
API Documentation

This is a comprehensive guide to our API endpoints with detailed examples.

Endpoint Method Description Parameters Response

/api/users GET Get all users page, limit JSON array of users

/api/users/{id} GET Get user by ID id (path) JSON user object

/api/users POST Create new user name, email, role Created user object

/api/users/{id} PUT Update user id (path), name, email, role Updated user object

/api/users/{id} DELETE Delete user id (path) Success message

Authentication

All API endpoints require authentication using JWT tokens in the Authorization header.

curl -H "Authorization: Bearer YOUR_TOKEN" -X GET https://api.example.com/users

Additional content to make this chunk larger and test the token limits effectively.
`

	// Test with default config (smaller chunks)
	defaultChunks, err := ChunkHTMLContent("API Guide", largeHTML)
	require.NoError(t, err)
	// Without this guard the comparison below could pass against an empty baseline.
	require.NotEmpty(t, defaultChunks, "Default config should produce at least one chunk")

	// Test with larger token config (should create fewer, larger chunks)
	largeConfig := ChunkConfig{
		MaxTokens:      2000, // Much larger than default 700
		MinTokens:      400,  // Larger than default 200
		OverlapTokens:  150,  // Larger than default 75
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
	largeChunks, err := ChunkHTMLContent("API Guide", largeHTML, largeConfig)
	require.NoError(t, err)
	// Without this guard the metadata loop would be skipped and the test
	// would pass without checking anything.
	require.NotEmpty(t, largeChunks, "Large config should produce at least one chunk")

	// Verify that larger config creates fewer chunks
	assert.LessOrEqual(t, len(largeChunks), len(defaultChunks),
		"Larger token config should create fewer or equal chunks. Default: %d, Large: %d",
		len(defaultChunks), len(largeChunks))

	// Verify that chunks contain expected metadata
	for _, chunk := range largeChunks {
		tokens, ok := chunk.Metadata["tokens"].(int)
		if !assert.True(t, ok, "Chunk should have token count in metadata") {
			// tokens is the zero value here; checking it would only add a
			// second, misleading failure.
			continue
		}
		assert.Greater(t, tokens, 0, "Token count should be positive")

		// Check that we're respecting the larger config
		if tokens > oldDefaultMaxTokens { // Old default limit
			t.Logf("✅ Large chunk with %d tokens (exceeds old 700 limit)", tokens)
		}
	}

	t.Logf("Default config: %d chunks, Large config: %d chunks", len(defaultChunks), len(largeChunks))
}

Parameter	Type	Required	Description
email	string	Yes	User's email address
first_name	string	Yes	User's first name
last_name	string	No	User's last name
role	string	No	User role: admin, agent, or user

Feature	Starter $29/month	Professional $79/month	Enterprise $199/month
Support Agents	Up to 3	Up to 10	Unlimited
Monthly Conversations	500	5,000	Unlimited
Email Support	✓	✓	✓
Live Chat Widget	✓	✓	✓
Knowledge Base	Basic	Advanced	Advanced + AI
Custom Branding	×	✓	✓
Advanced Analytics	×	✓	✓
API Access	×	Basic	Full Access
Priority Support	×	×	✓

Setting	Recommended Value	Notes
Session timeout	30 minutes	Balances security and usability
Auto-save interval	30 seconds	Prevents data loss
Language	Auto-detect	Based on user browser

Endpoint	Method	Description	Parameters	Response
/api/users	GET	Get all users	page, limit	JSON array of users
/api/users/{id}	GET	Get user by ID	id (path)	JSON user object
/api/users	POST	Create new user	name, email, role	Created user object
/api/users/{id}	PUT	Update user	id (path), name, email, role	Updated user object
/api/users/{id}	DELETE	Delete user	id (path)	Success message

This is a heading that should create a chunk

This is unclosed text

Deep

This is & some text with <entities> and unicode © characters.

\n\t leading and trailing spaces \n\n

" + strings.Repeat("word ", 1000) + "

This is some text before.

This is some text after.

This is paragraph one. It has enough text to be a chunk with many words here.

This is a Heading

This is paragraph two, which should start in a new chunk.

Small one.

Small two.

Small three.

Important Heading Inside Low Priority Container

This is the first sentence. It provides context for the next part.

This is the second sentence. It should be part of the overlap.

This is the third sentence. This marks the beginning of the second chunk.

This is the fourth sentence. More content for the second chunk here.

Question sentence? Another with exclamation! Normal sentence.

Sentence with ellipsis... And another normal one.

Main Title

User API Reference

Authentication

Create User

Request

Parameters

Example Request

Response

Email Not Working? Troubleshooting Guide

Step 1: Check Your Email Settings

Step 2: Test Email Delivery

If test email works:

If test email doesn't work:

Step 3: Advanced Troubleshooting

Account Setup Guide

Step 1: Create Your Account

Step 2: Verify Your Email

Terms of Service

Pricing Plans Comparison

Additional Features

Getting Started

Introduction

Prerequisites

Another Main Section

Wait, this should be an h2

Finally a proper h2

Skipping h3 entirely

Quick Start

Install

Configure

Run

Test

Deploy

Database

Cache

Storage

Monitoring

Logging

Release Notes

Version 2.1.0 - January 15, 2025

New Features

Bug Fixes

Version 2.0.3 - December 20, 2024

New Features

Bug Fixes

Version 2.0.2 - November 30, 2024

New Features

Bug Fixes

Setting Up Your Live Chat Widget

Step 1: Access Widget Settings

Step 2: Copy the Widget Code

Step 3: Add Code to Your Website

Step 4: Test the Widget

Step 5: Customize Appearance

10 Ways to Improve Customer Support

1. Response Time Optimization

2. Knowledge Management

3. Team Training

Frequently Asked Questions