",
config: newTestConfig(100, 10, 5),
expectedChunks: 1,
validate: func(t *testing.T, chunks []HTMLChunk) {
// Check that excessive whitespace is handled
assert.NotContains(t, chunks[0].Text, " ", "Whitespace should be normalized")
},
},
{
name: "Giant Token Test - Single Massive Block",
html: "
" + strings.Repeat("word ", 1000) + "
", // 1000 tokens in one block
config: newTestConfig(50, 20, 10),
expectedChunks: 1, // Should truncate oversized content to fit max tokens
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
chunks, err := ChunkHTMLContent("Edge Case", tc.html, tc.config)
if tc.expectedError != "" {
require.Error(t, err)
assert.Contains(t, err.Error(), tc.expectedError)
} else {
require.NoError(t, err)
assert.Len(t, chunks, tc.expectedChunks)
if tc.validate != nil {
tc.validate(t, chunks)
}
}
})
}
}
func TestChunkingLogic(t *testing.T) {
// Chunks a generated document with a small token configuration and expects
// more than one chunk back.
t.Run("Large Content Exceeding MaxTokens", func(t *testing.T) {
	html := generateHTML("h3", "word1 word2 word3 word4 word5", 20) // 100 words/tokens
	config := newTestConfig(50, 20, 10)

	chunks, err := ChunkHTMLContent("Large Content", html, config)
	require.NoError(t, err)
	// assert.Greater prints both operands on failure, which a bare
	// assert.True on a comparison does not.
	assert.Greater(t, len(chunks), 1, "Should be split into multiple chunks")

	// Soft check: a chunk above the limit is logged, not failed.
	for _, chunk := range chunks {
		tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
		if tokens > config.MaxTokens {
			t.Logf("Warning: Chunk with %d tokens exceeds MaxTokens of %d", tokens, config.MaxTokens)
		}
	}
})
// Chunks a document containing a long "code block" passage and checks that a
// chunk mentioning it exists and stays within the configured token limit.
t.Run("PreserveBlocks Functionality - No Split Zone", func(t *testing.T) {
	html := `
This is some text before.
This is a code block that should not be split. It contains many words to exceed the token limit if it were normal text. one two three four five six seven eight nine ten eleven twelve thirteen.
This is some text after.
`
	config := newTestConfig(20, 10, 5)

	chunks, err := ChunkHTMLContent("Preserve", html, config)
	require.NoError(t, err)
	assert.NotEmpty(t, chunks)

	// With simple truncation, oversized content gets truncated to fit max tokens
	// We should still have some content that was originally code
	hasCodeContent := false
	for _, chunk := range chunks {
		if !strings.Contains(chunk.OriginalHTML, "code block") {
			continue
		}
		hasCodeContent = true
		// Verify it respects token limits. The bound is read from the config
		// rather than repeated as a literal, so the two cannot drift apart
		// (assumes newTestConfig's first argument populates MaxTokens — TODO confirm).
		tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
		assert.LessOrEqual(t, tokens, config.MaxTokens, "Truncated content should respect max tokens")
		break // only the first matching chunk is inspected
	}
	assert.True(t, hasCodeContent, "Should have truncated code content")
})
// Chunks a document with a heading between two paragraphs and checks that at
// least one resulting chunk is flagged as containing a heading.
t.Run("Priority-based chunking (headings)", func(t *testing.T) {
	html := `
This is paragraph one. It has enough text to be a chunk with many words here.
This is a Heading
This is paragraph two, which should start in a new chunk.
`
	config := newTestConfig(30, 5, 3)

	chunks, err := ChunkHTMLContent("Headings", html, config)
	require.NoError(t, err)
	// NotEmpty states the intent directly and reports the value on failure.
	assert.NotEmpty(t, chunks)

	// Check if we have heading chunks
	hasHeadingChunk := false
	for _, chunk := range chunks {
		if chunk.HasHeading {
			hasHeadingChunk = true
			break
		}
	}
	assert.True(t, hasHeadingChunk, "Should have at least one chunk with heading")
})
// Chunks three very short lines and checks only that chunking succeeds and
// yields at least one chunk; how the lines are merged is not asserted here.
t.Run("Boundary merging for small elements - Micro Token Test", func(t *testing.T) {
	html := `
Small one.
Small two.
Small three.
`
	config := newTestConfig(50, 5, 3)

	chunks, err := ChunkHTMLContent("Merging", html, config)
	require.NoError(t, err)
	// NotEmpty reports the actual value on failure, unlike assert.True on a comparison.
	assert.NotEmpty(t, chunks, "Should create at least one chunk")
})
t.Run("Priority Conflict Test", func(t *testing.T) {
html := `
Follow these steps to diagnose and fix email delivery issues:
Step 1: Check Your Email Settings
Verify SMTP server settings are correct
Check if port 587 or 465 is properly configured
Ensure authentication credentials are valid
Step 2: Test Email Delivery
If Step 1 doesn't resolve the issue:
Send a test email to yourself
Check spam/junk folders
Try sending to a different email provider (Gmail, Yahoo, etc.)
If test email works:
The issue is likely with the recipient's email
Ask them to check their spam folder
Verify the recipient email address is correct
If test email doesn't work:
Check error logs in Admin > System > Logs
Look for SMTP authentication errors
Contact your email provider about potential blocks
Step 3: Advanced Troubleshooting
Still having issues? Try these advanced steps:
Enable debug logging for email delivery
Check DNS records (SPF, DKIM, DMARC)
Test with a different SMTP provider
Contact support with error logs
`,
"wysiwyg_nightmare": `
Account Setup Guide
Welcome to our platform! Getting started is easy.
Important:Please read all instructions carefully.
Step 1: Create Your Account
Navigate to the registration page
Fill in your email address
Choose a strong password (minimum 8 characters)
Step 2: Verify Your Email
Check your inbox for a verification email. Note: It may take up to 5 minutes to arrive.
If you don't see it, check your spam folder.
"Make sure to complete verification within 24 hours, or you'll need to register again."
`,
"legal_wall_text": `
Terms of Service
These Terms of Service ("Terms") govern your use of our website and services. By accessing or using our services, you agree to be bound by these Terms. If you disagree with any part of these terms, then you may not access the service. This Terms of Service agreement for our service has been created with the help of legal counsel and covers all the important aspects of using our platform. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Continued use of the service after any such changes shall constitute your consent to such changes. You can review the most current version of the Terms of Service at any time by visiting this page. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Violation of any of the terms below will result in the termination of your account and your access to the service. While we prohibit such conduct and content on the service, you understand and agree that we cannot be responsible for the content posted on the service and you nonetheless may be exposed to such materials. You agree to use the service at your own risk.
You must be 13 years or older to use this service. You must be human and you must provide us with accurate information when you register for an account. Your login may only be used by one person and a single login shared by multiple people is not permitted. You are responsible for maintaining the security of your account and password. The company cannot and will not be liable for any loss or damage from your failure to comply with this security obligation. You are responsible for all content posted and all actions taken with your account. We reserve the right to refuse service to anyone for any reason at any time. We reserve the right to force forfeiture of any username that becomes inactive, violates trademark, or may mislead other users.
`,
"feature_comparison_table": `
Pricing Plans Comparison
Choose the plan that best fits your business needs:
Feature
Starter $29/month
Professional $79/month
Enterprise $199/month
Support Agents
Up to 3
Up to 10
Unlimited
Monthly Conversations
500
5,000
Unlimited
Email Support
✓
✓
✓
Live Chat Widget
✓
✓
✓
Knowledge Base
Basic
Advanced
Advanced + AI
Custom Branding
×
✓
✓
Advanced Analytics
×
✓
✓
API Access
×
Basic
Full Access
Priority Support
×
×
✓
Additional Features
All plans include: 24/7 uptime monitoring, SSL encryption, regular backups
Professional and Enterprise: Custom integrations, advanced workflows
This is not really a quote but we're using blockquote for styling
Prerequisites
This should be a heading but it's a div
Some normal paragraph text here.
Another Main Section
Wait, this should be an h2
Fake heading in a span
This content is in divs instead of paragraphs for some reason.
Another line in a div.
Finally a proper h2
Skipping h3 entirely
Table cell used as heading
Regular table content here
`,
"minimalist_haiku": `
Quick Start
Install
npm install
Configure
Edit config.json
Run
npm start
Test
npm test
Deploy
Push to production
Database
PostgreSQL
Cache
Redis
Storage
S3
Monitoring
Datadog
Logging
Sentry
`,
"release_notes": `
Release Notes
Version 2.1.0 - January 15, 2025
New Features
Added AI-powered response suggestions for agents
Implemented advanced search filters in conversation list
Added support for file attachments in live chat
New dashboard widgets for team performance metrics
Bug Fixes
Fixed email notifications not being sent for certain conversation states
Resolved timezone display issues in reporting
Fixed widget positioning on mobile devices
Version 2.0.3 - December 20, 2024
New Features
Added bulk actions for conversation management
Implemented custom fields for customer profiles
Added integration with Slack for team notifications
Bug Fixes
Fixed memory leak in WebSocket connections
Resolved search indexing issues with special characters
Fixed CSV export formatting problems
Version 2.0.2 - November 30, 2024
New Features
Added support for multiple languages in knowledge base
Implemented automated conversation routing based on keywords
Bug Fixes
Fixed authentication issues with SSO providers
Resolved performance issues with large conversation histories
`,
"image_heavy_guide": `
Setting Up Your Live Chat Widget
Follow these visual steps to add the chat widget to your website:
Step 1: Access Widget Settings
Navigate to Admin > Channels > Live Chat and click on your chat channel.
Step 2: Copy the Widget Code
In the Widget Code section, click the "Copy Code" button to copy the JavaScript snippet.
Step 3: Add Code to Your Website
Paste the code just before the closing </body> tag in your website's HTML.
Step 4: Test the Widget
Visit your website and verify the chat widget appears in the bottom right corner.
Step 5: Customize Appearance
Back in the admin panel, you can customize the widget's color, position, and welcome message.
`,
"nested_lists": `
10 Ways to Improve Customer Support
1. Response Time Optimization
Set clear response time expectations
Email: Within 4 hours during business hours
Live chat: Within 2 minutes
Phone: Answer within 3 rings
Use automation to acknowledge receipt
Auto-reply emails
Chat welcome messages
Ticket confirmation SMS
2. Knowledge Management
Create comprehensive FAQ sections
Common technical issues
Login problems
Password reset
Browser compatibility
Billing and account questions
Payment methods
Subscription changes
Refund policies
Maintain up-to-date documentation
Review quarterly
Update with new features
Remove outdated information
3. Team Training
Product knowledge training
Monthly product updates
Hands-on feature testing
Cross-departmental sessions
Communication skills development
Active listening techniques
Empathy building exercises
Conflict resolution strategies
`,
"faq_description_lists": `
Frequently Asked Questions
Find answers to common questions about our platform:
Account & Billing
How do I change my subscription plan?
You can upgrade or downgrade your plan at any time from your account settings. Navigate to Billing > Subscription and select your new plan. Changes take effect immediately for upgrades, or at the next billing cycle for downgrades.
Can I get a refund if I'm not satisfied?
Yes, we offer a 30-day money-back guarantee for all new subscriptions. Contact our support team within 30 days of your initial purchase for a full refund.
Do you offer annual billing discounts?
Absolutely! Annual subscriptions receive a 20% discount compared to monthly billing. You can switch to annual billing from your account settings at any time.
Technical Support
What browsers do you support?
Our platform works best with modern browsers including Chrome 90+, Firefox 88+, Safari 14+, and Edge 90+. We recommend keeping your browser updated for the best experience.
Is my data secure?
Yes, we take security seriously. All data is encrypted in transit and at rest using industry-standard encryption. We're SOC 2 compliant and undergo regular security audits.
Can I integrate with my existing tools?
We offer integrations with 100+ popular tools including Slack, Salesforce, HubSpot, Zapier, and more. Check our integrations page for a complete list, or use our REST API for custom integrations.
Getting Started
How long does setup take?
Most customers are up and running within 15 minutes. Our setup wizard guides you through the essential configuration steps, and you can always customize further later.
Do you provide onboarding assistance?
Yes! All paid plans include free onboarding support. We'll help you configure your account, import your data, and train your team. Enterprise customers get dedicated onboarding specialists.
`,
"kitchen_sink": `
Complete Getting Started Guide
Welcome to the most comprehensive guide for setting up your customer support platform. This guide covers everything you need to know.
Pro Tip: Bookmark this page for easy reference during setup!
After updating your configuration file, restart the application to apply the changes:
sudo systemctl restart libredesk
`,
}
}
func TestRealWorldScenarios(t *testing.T) {
samples := getRealWorldHTMLSamples()
config := newTestConfig(150, 50, 15) // Use smaller limits to trigger chunking with word-based tokenizer
testCases := []struct {
name string
htmlKey string
expectedMinChunks int
expectedMaxChunks int
validationCallback func(*testing.T, []HTMLChunk, string)
}{
{
name: "API Reference Manual",
htmlKey: "api_reference",
expectedMinChunks: 1,
expectedMaxChunks: 8,
validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
// Should have code blocks and tables properly chunked
hasCodeChunk := false
hasTableChunk := false
for _, chunk := range chunks {
if chunk.HasCode {
hasCodeChunk = true
}
if chunk.HasTable {
hasTableChunk = true
}
}
assert.True(t, hasCodeChunk, "API reference should have at least one code chunk")
assert.True(t, hasTableChunk, "API reference should have at least one table chunk")
},
},
{
	name:              "Troubleshooting Guide",
	htmlKey:           "troubleshooting_guide",
	expectedMinChunks: 2,
	expectedMaxChunks: 6,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Should chunk well with nested lists and headings.
		// The comparison assertions print both operands on failure.
		assert.GreaterOrEqual(t, len(chunks), 2, "Troubleshooting guide should split into multiple logical sections")

		// Every chunk must tokenize to a non-zero count.
		for i, chunk := range chunks {
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			assert.Greater(t, tokens, 0, "Chunk %d should have content", i)
		}
	},
},
{
	name:              "WYSIWYG Nightmare",
	htmlKey:           "wysiwyg_nightmare",
	expectedMinChunks: 1,
	expectedMaxChunks: 4,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Should handle messy HTML gracefully.
		// NotEmpty reports the actual value on failure, unlike assert.True on a comparison.
		assert.NotEmpty(t, chunks, "WYSIWYG content should create at least one chunk")

		// Verify no chunks are empty after cleaning
		for i, chunk := range chunks {
			cleanText := HTML2Text(chunk.OriginalHTML)
			assert.NotEmpty(t, strings.TrimSpace(cleanText), "Chunk %d should not be empty after HTML cleanup", i)
		}
	},
},
{
	name:              "Legal Wall of Text",
	htmlKey:           "legal_wall_text",
	expectedMinChunks: 1,
	expectedMaxChunks: 3,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Should handle very long paragraphs by splitting appropriately.
		// The size bound is only checked when the content was actually split.
		if len(chunks) <= 1 {
			return
		}

		// No chunk should be excessively large (allow some tolerance):
		// the ceiling is twice the shared config's MaxTokens.
		limit := config.MaxTokens * 2
		for i, chunk := range chunks {
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			assert.LessOrEqual(t, tokens, limit, "Chunk %d should not be excessively large (%d tokens)", i, tokens)
		}
	},
},
// NOTE(review): this entry appears damaged. The string literal opened on the
// assert.Contains line is not closed before the line break (an interpreted Go
// string cannot span lines), and the text that follows ("= 1, ...") reads like
// the tail of an assertion belonging to a different test case. Presumably the
// markup inside the literal and the lines in between were lost — restore this
// entry from version control before relying on it.
{
name: "Feature Comparison Table",
htmlKey: "feature_comparison_table",
expectedMinChunks: 1,
expectedMaxChunks: 4,
validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
// Should preserve table structure
hasTable := false
for _, chunk := range chunks {
if chunk.HasTable {
hasTable = true
// Table chunk should contain table structure
assert.Contains(t, chunk.OriginalHTML, "
= 1, "Should handle poorly structured HTML")
// Verify chunker doesn't break on weird nesting
for i, chunk := range chunks {
assert.NotEmpty(t, chunk.OriginalHTML, "Chunk %d should have content", i)
}
},
},
{
	name:              "Minimalist Haiku",
	htmlKey:           "minimalist_haiku",
	expectedMinChunks: 1,
	expectedMaxChunks: 3,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Should aggregate small sections appropriately.
		// Only the summed token count across all chunks is checked here.
		totalTokens := 0
		for _, chunk := range chunks {
			totalTokens += simpleTokenizer(HTML2Text(chunk.OriginalHTML))
		}
		// assert.Greater prints the actual total on failure.
		assert.Greater(t, totalTokens, 0, "Should have some content")
	},
},
{
	name:              "Release Notes",
	htmlKey:           "release_notes",
	expectedMinChunks: 2,
	expectedMaxChunks: 6,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// At least one chunk must be flagged as containing a heading.
		foundHeading := false
		for i := range chunks {
			if chunks[i].HasHeading {
				foundHeading = true
			}
		}
		assert.True(t, foundHeading, "Release notes should maintain heading structure")
	},
},
// NOTE(review): the needle passed to strings.Contains below is split across two
// lines, which is not valid for an interpreted string literal. Presumably an
// HTML tag that preceded "Step" was lost — confirm the intended needle against
// version control.
{
name: "Image Heavy Guide",
htmlKey: "image_heavy_guide",
expectedMinChunks: 2,
expectedMaxChunks: 8,
validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
// Should chunk around steps with images
stepCount := 0
for _, chunk := range chunks {
if strings.Contains(chunk.OriginalHTML, "
Step") {
stepCount++
}
}
assert.True(t, stepCount >= 1, "Should preserve step-based structure")
},
},
{
	name:              "Nested Lists",
	htmlKey:           "nested_lists",
	expectedMinChunks: 2,
	expectedMaxChunks: 6,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Each chunk's HTML must convert to non-empty text.
		for i := range chunks {
			text := HTML2Text(chunks[i].OriginalHTML)
			assert.NotEmpty(t, text, "Chunk %d should have meaningful content", i)
		}
	},
},
// NOTE(review): the needle passed to strings.Contains below is split across two
// lines, which is not valid for an interpreted string literal, and what remains
// is only a line break. Presumably an HTML tag was lost — confirm the intended
// needle against version control.
{
name: "FAQ Description Lists",
htmlKey: "faq_description_lists",
expectedMinChunks: 2,
expectedMaxChunks: 6,
validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
// Should handle dl/dt/dd structure appropriately
hasDescriptionList := false
for _, chunk := range chunks {
if strings.Contains(chunk.OriginalHTML, "
") {
hasDescriptionList = true
}
}
assert.True(t, hasDescriptionList, "Should preserve description list structure")
},
},
{
name: "Kitchen Sink",
htmlKey: "kitchen_sink",
expectedMinChunks: 3,
expectedMaxChunks: 10,
validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
// Should handle all element types
hasHeading := false
hasCode := false
hasTable := false
for _, chunk := range chunks {
if chunk.HasHeading {
hasHeading = true
}
if chunk.HasCode {
hasCode = true
}
if chunk.HasTable {
hasTable = true
}
}
assert.True(t, hasHeading, "Kitchen sink should have headings")
assert.True(t, hasCode, "Kitchen sink should have code")
assert.True(t, hasTable, "Kitchen sink should have tables")
},
},
{
	name:              "Markdown Import",
	htmlKey:           "markdown_import",
	expectedMinChunks: 2,
	expectedMaxChunks: 8,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// At least one chunk must be flagged as containing code.
		sawCode := false
		for i := range chunks {
			if chunks[i].HasCode {
				sawCode = true
			}
		}
		assert.True(t, sawCode, "Markdown import should preserve code blocks")
	},
},
{
	name:              "Interactive Transcript",
	htmlKey:           "interactive_transcript",
	expectedMinChunks: 2,
	expectedMaxChunks: 6,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// At least one chunk's HTML must still carry a data-type attribute.
		sawDataType := false
		for i := range chunks {
			if strings.Contains(chunks[i].OriginalHTML, "data-type=") {
				sawDataType = true
			}
		}
		assert.True(t, sawDataType, "Should preserve custom interactive elements")
	},
},
{
	name:              "Giant Code Block",
	htmlKey:           "giant_code_block",
	expectedMinChunks: 1,
	expectedMaxChunks: 3,
	validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
		// Should truncate oversized code blocks to respect max tokens
		hasCodeBlocks := false
		for _, chunk := range chunks {
			if !chunk.HasCode {
				continue
			}
			hasCodeBlocks = true
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			// The bound is read from the shared test config rather than
			// repeated as a literal, so it follows any change to that config
			// (assumes newTestConfig's first argument populates MaxTokens — TODO confirm).
			assert.LessOrEqual(t, tokens, config.MaxTokens, "Code block chunk should not exceed max tokens after truncation")
			t.Logf("Code block chunk has %d tokens", tokens)
		}
		assert.True(t, hasCodeBlocks, "Should have code blocks")
	},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
html, exists := samples[tc.htmlKey]
require.True(t, exists, "Test HTML sample %s should exist", tc.htmlKey)
chunks, err := ChunkHTMLContent(tc.name, html, config)
require.NoError(t, err, "Chunking should not fail for %s", tc.name)
// Basic validation
assert.GreaterOrEqual(t, len(chunks), tc.expectedMinChunks,
"Should have at least %d chunks for %s", tc.expectedMinChunks, tc.name)
assert.LessOrEqual(t, len(chunks), tc.expectedMaxChunks,
"Should have at most %d chunks for %s", tc.expectedMaxChunks, tc.name)
// Verify chunk metadata
for i, chunk := range chunks {
assert.Equal(t, i, chunk.ChunkIndex, "Chunk index should be correct")
assert.Equal(t, len(chunks), chunk.TotalChunks, "Total chunks should be correct")
assert.NotEmpty(t, chunk.Text, "Chunk text should not be empty")
assert.Contains(t, chunk.Text, tc.name, "Chunk should contain title")
}
// Token distribution validation
totalTokens := 0
for i, chunk := range chunks {
tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
totalTokens += tokens
// Log token distribution for analysis
t.Logf("Chunk %d: %d tokens", i, tokens)
}
// Scenario-specific validation
if tc.validationCallback != nil {
tc.validationCallback(t, chunks, tc.name)
}
t.Logf("Scenario '%s': %d chunks, %d total tokens", tc.name, len(chunks), totalTokens)
})
}
}
// TestChunkHTMLContent_ConfigurableTokenLimits tests that custom token limits work correctly.
//
// The same document is chunked twice: once without passing a config and once
// with a ChunkConfig using much larger limits. The larger limits must not
// produce more chunks, and every chunk from the larger run must carry a
// positive int under the "tokens" metadata key.
func TestChunkHTMLContent_ConfigurableTokenLimits(t *testing.T) {
	// Large HTML content with table and code
	largeHTML := `
API Documentation
This is a comprehensive guide to our API endpoints with detailed examples.
Endpoint
Method
Description
Parameters
Response
/api/users
GET
Get all users
page, limit
JSON array of users
/api/users/{id}
GET
Get user by ID
id (path)
JSON user object
/api/users
POST
Create new user
name, email, role
Created user object
/api/users/{id}
PUT
Update user
id (path), name, email, role
Updated user object
/api/users/{id}
DELETE
Delete user
id (path)
Success message
Authentication
All API endpoints require authentication using JWT tokens in the Authorization header.
curl -H "Authorization: Bearer YOUR_TOKEN" -X GET https://api.example.com/users
Additional content to make this chunk larger and test the token limits effectively.
`
	// Test with default config (smaller chunks)
	defaultChunks, err := ChunkHTMLContent("API Guide", largeHTML)
	require.NoError(t, err)

	// Test with larger token config (should create fewer, larger chunks)
	largeConfig := ChunkConfig{
		MaxTokens:      2000, // Much larger than default 700
		MinTokens:      400,  // Larger than default 200
		OverlapTokens:  150,  // Larger than default 75
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
	largeChunks, err := ChunkHTMLContent("API Guide", largeHTML, largeConfig)
	require.NoError(t, err)

	// Verify that larger config creates fewer chunks.
	// LessOrEqual prints both counts on failure, unlike assert.True on a comparison.
	assert.LessOrEqual(t, len(largeChunks), len(defaultChunks),
		"Larger token config should create fewer or equal chunks. Default: %d, Large: %d",
		len(defaultChunks), len(largeChunks))

	// Verify that chunks contain expected metadata
	for _, chunk := range largeChunks {
		tokens, ok := chunk.Metadata["tokens"].(int)
		if !assert.True(t, ok, "Chunk should have token count in metadata") {
			// tokens is the zero value here; checking it would only add a
			// second, misleading failure for the same chunk.
			continue
		}
		assert.Greater(t, tokens, 0, "Token count should be positive")

		// Check that we're respecting the larger config
		if tokens > 700 { // Old default limit
			t.Logf("✅ Large chunk with %d tokens (exceeds old 700 limit)", tokens)
		}
	}
	t.Logf("Default config: %d chunks, Large config: %d chunks", len(defaultChunks), len(largeChunks))
}