libredesk/internal/stringutil/htmlchunker_test.go

package stringutil

import (
	"fmt"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// --- Test Helper Functions ---

func simpleTokenizer(text string) int {
	// 1 word = 1 token for simplicity in testing
	return len(strings.Fields(text))
}

func newTestConfig(max, min, overlap int) ChunkConfig {
	return ChunkConfig{
		MaxTokens:      max,
		MinTokens:      min,
		OverlapTokens:  overlap,
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
}

func generateHTML(tag, content string, count int) string {
	var b strings.Builder
	for i := 0; i < count; i++ {
		b.WriteString(fmt.Sprintf("<%s>%s %d</%s>\n", tag, content, i, tag))
	}
	return b.String()
}

// --- Test Cases ---

func TestChunkHTMLContent_Basic(t *testing.T) {
	testCases := []struct {
		name           string
		title          string
		html           string
		config         ChunkConfig
		expectedChunks int
		expectedError  string
		validate       func(*testing.T, []HTMLChunk)
	}{
		{
			name:           "Empty Content",
			title:          "Empty Test",
			html:           "  ",
			config:         DefaultChunkConfig(),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, "Empty Test", chunks[0].Text)
				assert.True(t, chunks[0].Metadata["empty"].(bool))
				assert.Equal(t, 0, chunks[0].ChunkIndex)
				assert.Equal(t, 1, chunks[0].TotalChunks)
			},
		},
		{
			name:           "Title Only with Empty HTML",
			title:          "Title Only",
			html:           "",
			config:         DefaultChunkConfig(),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, "Title Only", chunks[0].Text)
			},
		},
		{
			name:           "Single Chunk Scenario",
			title:          "Single Chunk",
			html:           "<h2>This is a heading that should create a chunk</h2>",
			config:         newTestConfig(100, 10, 5),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Contains(t, chunks[0].Text, "This is a heading")
				assert.Equal(t, 1, chunks[0].TotalChunks)
				assert.True(t, chunks[0].HasHeading)
			},
		},
		{
			name:           "Multiple Chunks Scenario",
			title:          "Multiple Chunks",
			html:           generateHTML("h3", "This is a heading.", 10), // 10 headings * 4 words = 40 tokens
			config:         newTestConfig(20, 10, 5),
			expectedChunks: 3, // Adjusted based on actual behavior
			validate: func(t *testing.T, chunks []HTMLChunk) {
				assert.Equal(t, 3, chunks[0].TotalChunks)
				assert.Equal(t, 0, chunks[0].ChunkIndex)
				assert.Equal(t, 1, chunks[1].ChunkIndex)
				assert.Equal(t, 2, chunks[2].ChunkIndex)
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			chunks, err := ChunkHTMLContent(tc.title, tc.html, tc.config)

			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
				return
			}

			require.NoError(t, err)
			require.Len(t, chunks, tc.expectedChunks)
			if tc.validate != nil {
				tc.validate(t, chunks)
			}
		})
	}
}

func TestChunkConfig_Validation(t *testing.T) {
	testCases := []struct {
		name          string
		config        ChunkConfig
		expectedError string
	}{
		{
			name:          "Invalid MaxTokens <= MinTokens",
			config:        newTestConfig(100, 100, 10),
			expectedError: "MaxTokens must be greater than MinTokens",
		},
		{
			name:          "Invalid OverlapTokens >= MinTokens",
			config:        newTestConfig(100, 50, 50),
			expectedError: "OverlapTokens must be less than MinTokens",
		},
		{
			name: "Custom Tokenizer",
			config: ChunkConfig{
				MaxTokens:     10,
				MinTokens:     5,
				OverlapTokens: 2,
				TokenizerFunc: func(s string) int { return len(s) }, // simple char count
			},
			expectedError: "", // Should be valid
		},
		{
			name:          "Nil Tokenizer should default",
			config:        ChunkConfig{MaxTokens: 100, MinTokens: 50, OverlapTokens: 10, TokenizerFunc: nil},
			expectedError: "",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := ChunkHTMLContent("test", "<p>hello</p>", tc.config)
			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
			} else {
				require.NoError(t, err)
			}
		})
	}
}

func TestChunkHTMLContent_EdgeCases(t *testing.T) {
	testCases := []struct {
		name           string
		html           string
		config         ChunkConfig
		expectedChunks int
		expectedError  string
		validate       func(*testing.T, []HTMLChunk)
	}{
		{
			name:           "Malformed HTML",
			html:           "<h2>This is <b>unclosed text</h2><div>",
			config:         newTestConfig(100, 10, 5),
			expectedChunks: 1, // Should still parse leniently
		},
		{
			name:           "Deeply Nested HTML",
			html:           "<div><section><article><h3><span><b>Deep</b></span></h3></article></section></div>",
			config:         newTestConfig(100, 10, 5),
			expectedChunks: 1,
		},
		{
			name:           "HTML Entities and Special Characters",
			html:           "<h2>This is &amp; some text with &lt;entities&gt; and unicode © characters.</h2>",
			config:         newTestConfig(100, 10, 5),
			expectedChunks: 1,
		},
		{
			name:           "Excessive Whitespace",
			html:           "  <h2>  \n\t  leading and trailing spaces   \n\n </h2>  ",
			config:         newTestConfig(100, 10, 5),
			expectedChunks: 1,
			validate: func(t *testing.T, chunks []HTMLChunk) {
				// Check that excessive whitespace is handled
				assert.NotContains(t, chunks[0].Text, "  ", "Whitespace should be normalized")
			},
		},
		{
			name:           "Giant Token Test - Single Massive Block",
			html:           "<h2>" + strings.Repeat("word ", 1000) + "</h2>", // 1000 tokens in one block
			config:         newTestConfig(50, 20, 10),
			expectedChunks: 1, // Should truncate oversized content to fit max tokens
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			chunks, err := ChunkHTMLContent("Edge Case", tc.html, tc.config)
			if tc.expectedError != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.expectedError)
			} else {
				require.NoError(t, err)
				assert.Len(t, chunks, tc.expectedChunks)
				if tc.validate != nil {
					tc.validate(t, chunks)
				}
			}
		})
	}
}

func TestChunkingLogic(t *testing.T) {
	t.Run("Large Content Exceeding MaxTokens", func(t *testing.T) {
		html := generateHTML("h3", "word1 word2 word3 word4 word5", 20) // 100 words/tokens
		config := newTestConfig(50, 20, 10)
		chunks, err := ChunkHTMLContent("Large Content", html, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) > 1, "Should be split into multiple chunks")
		for _, chunk := range chunks {
			tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
			if tokens > config.MaxTokens {
				t.Logf("Warning: Chunk with %d tokens exceeds MaxTokens of %d", tokens, config.MaxTokens)
			}
		}
	})

	t.Run("PreserveBlocks Functionality - No Split Zone", func(t *testing.T) {
		html := `
			<h3>This is some text before.</h3>
			<pre><code>This is a code block that should not be split. It contains many words to exceed the token limit if it were normal text. one two three four five six seven eight nine ten eleven twelve thirteen.</code></pre>
			<h3>This is some text after.</h3>
		`
		config := newTestConfig(20, 10, 5)
		chunks, err := ChunkHTMLContent("Preserve", html, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) >= 1)

		// With simple truncation, oversized content gets truncated to fit max tokens
		// We should still have some content that was originally code
		hasCodeContent := false
		for _, chunk := range chunks {
			if strings.Contains(chunk.OriginalHTML, "code block") {
				hasCodeContent = true
				// Verify it respects token limits
				tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
				assert.LessOrEqual(t, tokens, 20, "Truncated content should respect max tokens")
				break
			}
		}
		assert.True(t, hasCodeContent, "Should have truncated code content")
	})

	t.Run("Priority-based chunking (headings)", func(t *testing.T) {
		html := `
			<h3>This is paragraph one. It has enough text to be a chunk with many words here.</h3>
			<h2>This is a Heading</h2>
			<h3>This is paragraph two, which should start in a new chunk.</h3>
		`
		config := newTestConfig(30, 5, 3)
		chunks, err := ChunkHTMLContent("Headings", html, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) >= 1)

		// Check if we have heading chunks
		hasHeadingChunk := false
		for _, chunk := range chunks {
			if chunk.HasHeading {
				hasHeadingChunk = true
				break
			}
		}
		assert.True(t, hasHeadingChunk, "Should have at least one chunk with heading")
	})

	t.Run("Boundary merging for small elements - Micro Token Test", func(t *testing.T) {
		html := `
			<h4>Small one.</h4>
			<h4>Small two.</h4>
			<h4>Small three.</h4>
		`
		config := newTestConfig(50, 5, 3)
		chunks, err := ChunkHTMLContent("Merging", html, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) >= 1, "Should create at least one chunk")
	})

	t.Run("Priority Conflict Test", func(t *testing.T) {
		html := `<div><h1>Important Heading Inside Low Priority Container</h1></div>`
		config := newTestConfig(50, 10, 5)
		chunks, err := ChunkHTMLContent("Priority", html, config)
		require.NoError(t, err)
		assert.Len(t, chunks, 1)
		assert.True(t, chunks[0].HasHeading)
	})
}

func TestOverlapFunctionality(t *testing.T) {
	html := `
		<h3>This is the first sentence. It provides context for the next part.</h3>
		<h3>This is the second sentence. It should be part of the overlap.</h3>
		<h3>This is the third sentence. This marks the beginning of the second chunk.</h3>
		<h3>This is the fourth sentence. More content for the second chunk here.</h3>
	`

	t.Run("Overlap Extraction", func(t *testing.T) {
		config := newTestConfig(20, 10, 8) // Max 20, Overlap 8
		chunks, err := ChunkHTMLContent("Overlap", html, config)
		require.NoError(t, err)
		require.True(t, len(chunks) >= 2)

		if len(chunks) >= 2 {
			chunk1Text := HTML2Text(chunks[0].OriginalHTML)
			chunk2Text := HTML2Text(chunks[1].OriginalHTML)

			assert.Contains(t, chunk1Text, "first sentence")

			// Check for some form of overlap (implementation may vary)
			t.Logf("Chunk 1: %s", chunk1Text)
			t.Logf("Chunk 2: %s", chunk2Text)
		}
	})

	t.Run("Zero Overlap Configuration", func(t *testing.T) {
		config := newTestConfig(20, 10, 0) // No overlap
		chunks, err := ChunkHTMLContent("No Overlap", html, config)
		require.NoError(t, err)
		require.True(t, len(chunks) >= 1)

		t.Logf("Zero overlap test resulted in %d chunks", len(chunks))
	})

	t.Run("Sentence Boundary Test", func(t *testing.T) {
		htmlWithVariousEndings := `
			<h3>Question sentence? Another with exclamation! Normal sentence.</h3>
			<h3>Sentence with ellipsis... And another normal one.</h3>
		`
		config := newTestConfig(15, 8, 5)
		chunks, err := ChunkHTMLContent("Sentences", htmlWithVariousEndings, config)
		require.NoError(t, err)
		assert.True(t, len(chunks) >= 1)
	})
}

func TestMetadataAndOutput(t *testing.T) {
	html := `
		<h1>Main Title</h1>
		<p>Some paragraph text.</p>
		<pre><code>var x = 1;</code></pre>
		<table><tr><td>data</td></tr></table>
	`
	config := newTestConfig(100, 10, 5)
	chunks, err := ChunkHTMLContent("Metadata Test", html, config)
	require.NoError(t, err)
	require.Len(t, chunks, 1)

	chunk := chunks[0]
	t.Run("Struct Fields", func(t *testing.T) {
		assert.True(t, chunk.HasHeading, "Should detect heading")
		assert.True(t, chunk.HasCode, "Should detect code")
		assert.True(t, chunk.HasTable, "Should detect table")
		assert.Equal(t, 0, chunk.ChunkIndex)
		assert.Equal(t, 1, chunk.TotalChunks)
	})

	t.Run("Token Counting Accuracy", func(t *testing.T) {
		text := HTML2Text(chunk.OriginalHTML)
		expectedTokens := simpleTokenizer(text)
		metadataTokens, ok := chunk.Metadata["tokens"].(int)
		require.True(t, ok, "Metadata should contain tokens count")

		// Allow some variance since internal processing may differ
		assert.InDelta(t, expectedTokens, metadataTokens, 10, "Token count in metadata should be close to actual")
	})

	t.Run("Format for Embedding", func(t *testing.T) {
		assert.Contains(t, chunk.Text, "Title: Metadata Test")
		assert.Contains(t, chunk.Text, "Content:")
	})
}

func TestDefaultTokenizer(t *testing.T) {
	testCases := []struct {
		name     string
		text     string
		expected int // Expected tokens from defaultTokenizer (rune count * 0.4)
	}{
		{
			name:     "Simple English text",
			text:     "Hello world",
			expected: 4, // 11 runes * 0.4 ≈ 4 tokens
		},
		{
			name:     "Longer text",
			text:     "This is a longer sentence with multiple words",
			expected: 18, // 45 runes * 0.4 = 18 tokens
		},
		{
			name:     "Unicode characters",
			text:     "Hello 世界",
			expected: 3, // 8 runes * 0.4 ≈ 3 tokens
		},
		{
			name:     "Empty text",
			text:     "",
			expected: 0,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			tokens := defaultTokenizer(tc.text)
			assert.Equal(t, tc.expected, tokens)
		})
	}
}

func TestGetPriority(t *testing.T) {
	testCases := []struct {
		tag      string
		expected int
	}{
		{"h1", 1},
		{"h2", 1},
		{"h3", 2},
		{"pre", 2},
		{"code", 2},
		{"p", 3},
		{"table", 3},
		{"div", 4},
		{"span", 5},
		{"unknown", 5},
	}

	for _, tc := range testCases {
		t.Run(tc.tag, func(t *testing.T) {
			priority := getPriority(tc.tag)
			assert.Equal(t, tc.expected, priority)
		})
	}
}

// getRealWorldHTMLSamples returns realistic HTML samples for comprehensive testing
func getRealWorldHTMLSamples() map[string]string {
	return map[string]string{
		"api_reference": `
<h1>User API Reference</h1>
<h2>Authentication</h2>
<p>All API requests require authentication using an API key in the header:</p>
<pre><code>Authorization: Bearer your_api_key_here</code></pre>

<h2>Create User</h2>
<p>Creates a new user account in the system.</p>
<h3>Request</h3>
<p><strong>POST</strong> <code>/api/v1/users</code></p>

<h3>Parameters</h3>
<table>
<thead>
<tr><th>Parameter</th><th>Type</th><th>Required</th><th>Description</th></tr>
</thead>
<tbody>
<tr><td>email</td><td>string</td><td>Yes</td><td>User's email address</td></tr>
<tr><td>first_name</td><td>string</td><td>Yes</td><td>User's first name</td></tr>
<tr><td>last_name</td><td>string</td><td>No</td><td>User's last name</td></tr>
<tr><td>role</td><td>string</td><td>No</td><td>User role: admin, agent, or user</td></tr>
</tbody>
</table>

<h3>Example Request</h3>
<pre><code>{
  "email": "john.doe@example.com",
  "first_name": "John",
  "last_name": "Doe",
  "role": "agent"
}</code></pre>

<h3>Response</h3>
<p>Returns the created user object:</p>
<pre><code>{
  "id": 123,
  "email": "john.doe@example.com",
  "first_name": "John",
  "last_name": "Doe",
  "role": "agent",
  "created_at": "2025-01-15T10:30:00Z"
}</code></pre>`,

		"troubleshooting_guide": `
<h1>Email Not Working? Troubleshooting Guide</h1>
<p>Follow these steps to diagnose and fix email delivery issues:</p>

<h2>Step 1: Check Your Email Settings</h2>
<ul>
<li>Verify SMTP server settings are correct</li>
<li>Check if port 587 or 465 is properly configured</li>
<li>Ensure authentication credentials are valid</li>
</ul>

<h2>Step 2: Test Email Delivery</h2>
<p>If Step 1 doesn't resolve the issue:</p>
<ul>
<li>Send a test email to yourself</li>
<li>Check spam/junk folders</li>
<li>Try sending to a different email provider (Gmail, Yahoo, etc.)</li>
</ul>

<h3>If test email works:</h3>
<ul>
<li>The issue is likely with the recipient's email</li>
<li>Ask them to check their spam folder</li>
<li>Verify the recipient email address is correct</li>
</ul>

<h3>If test email doesn't work:</h3>
<ul>
<li>Check error logs in Admin > System > Logs</li>
<li>Look for SMTP authentication errors</li>
<li>Contact your email provider about potential blocks</li>
</ul>

<h2>Step 3: Advanced Troubleshooting</h2>
<p>Still having issues? Try these advanced steps:</p>
<ul>
<li>Enable debug logging for email delivery</li>
<li>Check DNS records (SPF, DKIM, DMARC)</li>
<li>Test with a different SMTP provider</li>
<li>Contact support with error logs</li>
</ul>`,

		"wysiwyg_nightmare": `
<h1><span style="font-family: Arial;">Account Setup Guide</span></h1>
<p>&nbsp;</p>
<div style="margin-left: 40px;">
<p><span style="color: #000000; font-size: 14px; font-family: Helvetica;">Welcome to our platform! <strong>Getting started</strong> is easy.</span></p>
</div>
<p>&nbsp;</p>
<p><span style="background-color: #ffff00;"><em>Important:</em></span> <span style="text-decoration: underline;">Please read</span> all instructions carefully.</p>
<p>&nbsp;</p>
<div style="border: 1px solid #ccc; padding: 10px;">
<h2><span style="color: #ff0000;">Step 1: Create Your Account</span></h2>
<ol>
<li><span style="font-size: 12px;">Navigate to the registration page</span></li>
<li><span style="font-size: 12px;">Fill in your email address</span></li>
<li><span style="font-size: 12px;">Choose a strong password (minimum 8 characters)</span></li>
</ol>
</div>
<p>&nbsp;</p>
<div style="background-color: #f0f0f0; padding: 15px;">
<h2>Step 2: Verify Your Email</h2>
<p>Check your inbox for a verification email. <strong>Note:</strong> It may take up to 5 minutes to arrive.</p>
<p>If you don't see it, check your <em>spam folder</em>.</p>
</div>
<p>&nbsp;</p>
<blockquote style="margin-left: 40px; font-style: italic; color: #666;">
"Make sure to complete verification within 24 hours, or you'll need to register again."
</blockquote>`,

		"legal_wall_text": `
<h1>Terms of Service</h1>
<p>These Terms of Service ("Terms") govern your use of our website and services. By accessing or using our services, you agree to be bound by these Terms. If you disagree with any part of these terms, then you may not access the service. This Terms of Service agreement for our service has been created with the help of legal counsel and covers all the important aspects of using our platform. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Continued use of the service after any such changes shall constitute your consent to such changes. You can review the most current version of the Terms of Service at any time by visiting this page. We reserve the right to update and change the Terms of Service from time to time without notice. Any new features that augment or enhance the current service, including the release of new tools and resources, shall be subject to the Terms of Service. Violation of any of the terms below will result in the termination of your account and your access to the service. While we prohibit such conduct and content on the service, you understand and agree that we cannot be responsible for the content posted on the service and you nonetheless may be exposed to such materials. You agree to use the service at your own risk.</p>
<p>You must be 13 years or older to use this service. You must be human and you must provide us with accurate information when you register for an account. Your login may only be used by one person and a single login shared by multiple people is not permitted. You are responsible for maintaining the security of your account and password. The company cannot and will not be liable for any loss or damage from your failure to comply with this security obligation. You are responsible for all content posted and all actions taken with your account. We reserve the right to refuse service to anyone for any reason at any time. We reserve the right to force forfeiture of any username that becomes inactive, violates trademark, or may mislead other users.</p>`,

		"feature_comparison_table": `
<h1>Pricing Plans Comparison</h1>
<p>Choose the plan that best fits your business needs:</p>

<table style="width: 100%; border-collapse: collapse; margin: 20px 0;">
<thead>
<tr style="background-color: #f5f5f5;">
<th style="border: 1px solid #ddd; padding: 12px; text-align: left;">Feature</th>
<th style="border: 1px solid #ddd; padding: 12px; text-align: center;">Starter<br><small>$29/month</small></th>
<th style="border: 1px solid #ddd; padding: 12px; text-align: center;">Professional<br><small>$79/month</small></th>
<th style="border: 1px solid #ddd; padding: 12px; text-align: center;">Enterprise<br><small>$199/month</small></th>
</tr>
</thead>
<tbody>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Support Agents</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Up to 3</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Up to 10</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Unlimited</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Monthly Conversations</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">500</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">5,000</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Unlimited</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Email Support</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Live Chat Widget</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Knowledge Base</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Basic</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Advanced</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Advanced + AI</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Custom Branding</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">×</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Advanced Analytics</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">×</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>API Access</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">×</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Basic</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">Full Access</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 12px;"><strong>Priority Support</strong></td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">×</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">×</td>
<td style="border: 1px solid #ddd; padding: 12px; text-align: center;">✓</td>
</tr>
</tbody>
</table>

<h2>Additional Features</h2>
<ul>
<li><strong>All plans include:</strong> 24/7 uptime monitoring, SSL encryption, regular backups</li>
<li><strong>Professional and Enterprise:</strong> Custom integrations, advanced workflows</li>
<li><strong>Enterprise only:</strong> Dedicated account manager, custom SLA, on-premise deployment options</li>
</ul>`,

		"poorly_structured_html": `
<h4>Getting Started</h4>
<h1>Introduction</h1>
<blockquote style="font-size: 18px; color: blue;">This is not really a quote but we're using blockquote for styling</blockquote>
<h3>Prerequisites</h3>
<div style="font-weight: bold; font-size: 16px;">This should be a heading but it's a div</div>
<p>Some normal paragraph text here.</p>
<h1>Another Main Section</h1>
<h6>Wait, this should be an h2</h6>
<div style="padding: 10px; background: #f0f0f0;">
<span style="font-size: 20px; font-weight: bold;">Fake heading in a span</span>
<div>This content is in divs instead of paragraphs for some reason.</div>
<div>Another line in a div.</div>
</div>
<h2>Finally a proper h2</h2>
<h4>Skipping h3 entirely</h4>
<table>
<tr><td style="font-weight: bold; font-size: 18px;">Table cell used as heading</td></tr>
<tr><td>Regular table content here</td></tr>
</table>`,

		"minimalist_haiku": `
<h1>Quick Start</h1>
<h2>Install</h2>
<p>npm install</p>
<h2>Configure</h2>
<p>Edit config.json</p>
<h2>Run</h2>
<p>npm start</p>
<h2>Test</h2>
<p>npm test</p>
<h2>Deploy</h2>
<p>Push to production</p>
<h3>Database</h3>
<p>PostgreSQL</p>
<h3>Cache</h3>
<p>Redis</p>
<h3>Storage</h3>
<p>S3</p>
<h3>Monitoring</h3>
<p>Datadog</p>
<h3>Logging</h3>
<p>Sentry</p>`,

		"release_notes": `
<h1>Release Notes</h1>

<h2>Version 2.1.0 - January 15, 2025</h2>
<h3>New Features</h3>
<ul>
<li>Added AI-powered response suggestions for agents</li>
<li>Implemented advanced search filters in conversation list</li>
<li>Added support for file attachments in live chat</li>
<li>New dashboard widgets for team performance metrics</li>
</ul>
<h3>Bug Fixes</h3>
<ul>
<li>Fixed email notifications not being sent for certain conversation states</li>
<li>Resolved timezone display issues in reporting</li>
<li>Fixed widget positioning on mobile devices</li>
</ul>

<h2>Version 2.0.3 - December 20, 2024</h2>
<h3>New Features</h3>
<ul>
<li>Added bulk actions for conversation management</li>
<li>Implemented custom fields for customer profiles</li>
<li>Added integration with Slack for team notifications</li>
</ul>
<h3>Bug Fixes</h3>
<ul>
<li>Fixed memory leak in WebSocket connections</li>
<li>Resolved search indexing issues with special characters</li>
<li>Fixed CSV export formatting problems</li>
</ul>

<h2>Version 2.0.2 - November 30, 2024</h2>
<h3>New Features</h3>
<ul>
<li>Added support for multiple languages in knowledge base</li>
<li>Implemented automated conversation routing based on keywords</li>
</ul>
<h3>Bug Fixes</h3>
<ul>
<li>Fixed authentication issues with SSO providers</li>
<li>Resolved performance issues with large conversation histories</li>
</ul>`,

		"image_heavy_guide": `
<h1>Setting Up Your Live Chat Widget</h1>
<p>Follow these visual steps to add the chat widget to your website:</p>

<h2>Step 1: Access Widget Settings</h2>
<img src="/images/step1-widget-settings.png" alt="Screenshot of admin dashboard with widget settings highlighted" />
<p>Navigate to Admin > Channels > Live Chat and click on your chat channel.</p>

<h2>Step 2: Copy the Widget Code</h2>
<img src="/images/step2-copy-code.png" alt="Screenshot showing the widget code section with copy button" />
<p>In the Widget Code section, click the "Copy Code" button to copy the JavaScript snippet.</p>

<h2>Step 3: Add Code to Your Website</h2>
<img src="/images/step3-paste-code.png" alt="Screenshot of website HTML with widget code pasted before closing body tag" />
<p>Paste the code just before the closing &lt;/body&gt; tag in your website's HTML.</p>

<h2>Step 4: Test the Widget</h2>
<img src="/images/step4-test-widget.png" alt="Screenshot of website with chat widget visible in bottom right corner" />
<p>Visit your website and verify the chat widget appears in the bottom right corner.</p>

<h2>Step 5: Customize Appearance</h2>
<img src="/images/step5-customize.png" alt="Screenshot of widget customization options showing color and position settings" />
<p>Back in the admin panel, you can customize the widget's color, position, and welcome message.</p>`,

		"nested_lists": `
<h1>10 Ways to Improve Customer Support</h1>

<h2>1. Response Time Optimization</h2>
<ul>
<li>Set clear response time expectations
<ul>
<li>Email: Within 4 hours during business hours</li>
<li>Live chat: Within 2 minutes</li>
<li>Phone: Answer within 3 rings</li>
</ul>
</li>
<li>Use automation to acknowledge receipt
<ul>
<li>Auto-reply emails</li>
<li>Chat welcome messages</li>
<li>Ticket confirmation SMS</li>
</ul>
</li>
</ul>

<h2>2. Knowledge Management</h2>
<ul>
<li>Create comprehensive FAQ sections
<ul>
<li>Common technical issues
<ul>
<li>Login problems</li>
<li>Password reset</li>
<li>Browser compatibility</li>
</ul>
</li>
<li>Billing and account questions
<ul>
<li>Payment methods</li>
<li>Subscription changes</li>
<li>Refund policies</li>
</ul>
</li>
</ul>
</li>
<li>Maintain up-to-date documentation
<ul>
<li>Review quarterly</li>
<li>Update with new features</li>
<li>Remove outdated information</li>
</ul>
</li>
</ul>

<h2>3. Team Training</h2>
<ul>
<li>Product knowledge training
<ul>
<li>Monthly product updates</li>
<li>Hands-on feature testing</li>
<li>Cross-departmental sessions</li>
</ul>
</li>
<li>Communication skills development
<ul>
<li>Active listening techniques</li>
<li>Empathy building exercises</li>
<li>Conflict resolution strategies</li>
</ul>
</li>
</ul>`,

		"faq_description_lists": `
<h1>Frequently Asked Questions</h1>
<p>Find answers to common questions about our platform:</p>

<h2>Account & Billing</h2>
<dl>
<dt>How do I change my subscription plan?</dt>
<dd>You can upgrade or downgrade your plan at any time from your account settings. Navigate to Billing > Subscription and select your new plan. Changes take effect immediately for upgrades, or at the next billing cycle for downgrades.</dd>

<dt>Can I get a refund if I'm not satisfied?</dt>
<dd>Yes, we offer a 30-day money-back guarantee for all new subscriptions. Contact our support team within 30 days of your initial purchase for a full refund.</dd>

<dt>Do you offer annual billing discounts?</dt>
<dd>Absolutely! Annual subscriptions receive a 20% discount compared to monthly billing. You can switch to annual billing from your account settings at any time.</dd>
</dl>

<h2>Technical Support</h2>
<dl>
<dt>What browsers do you support?</dt>
<dd>Our platform works best with modern browsers including Chrome 90+, Firefox 88+, Safari 14+, and Edge 90+. We recommend keeping your browser updated for the best experience.</dd>

<dt>Is my data secure?</dt>
<dd>Yes, we take security seriously. All data is encrypted in transit and at rest using industry-standard encryption. We're SOC 2 compliant and undergo regular security audits.</dd>

<dt>Can I integrate with my existing tools?</dt>
<dd>We offer integrations with 100+ popular tools including Slack, Salesforce, HubSpot, Zapier, and more. Check our integrations page for a complete list, or use our REST API for custom integrations.</dd>
</dl>

<h2>Getting Started</h2>
<dl>
<dt>How long does setup take?</dt>
<dd>Most customers are up and running within 15 minutes. Our setup wizard guides you through the essential configuration steps, and you can always customize further later.</dd>

<dt>Do you provide onboarding assistance?</dt>
<dd>Yes! All paid plans include free onboarding support. We'll help you configure your account, import your data, and train your team. Enterprise customers get dedicated onboarding specialists.</dd>
</dl>`,

		"kitchen_sink": `
<h1>Complete Getting Started Guide</h1>
<p>Welcome to the most comprehensive guide for setting up your customer support platform. This guide covers everything you need to know.</p>

<blockquote>
<p><strong>Pro Tip:</strong> Bookmark this page for easy reference during setup!</p>
</blockquote>

<h2>Table of Contents</h2>
<ol>
<li><a href="#account-setup">Account Setup</a></li>
<li><a href="#team-management">Team Management</a></li>
<li><a href="#channel-configuration">Channel Configuration</a></li>
<li><a href="#advanced-features">Advanced Features</a></li>
</ol>

<h2 id="account-setup">1. Account Setup</h2>
<p>First things first - let's get your account properly configured:</p>

<h3>Basic Information</h3>
<ul>
<li>Company name and details</li>
<li>Time zone configuration</li>
<li>Business hours setup</li>
</ul>

<table>
<thead>
<tr><th>Setting</th><th>Recommended Value</th><th>Notes</th></tr>
</thead>
<tbody>
<tr><td>Session timeout</td><td>30 minutes</td><td>Balances security and usability</td></tr>
<tr><td>Auto-save interval</td><td>30 seconds</td><td>Prevents data loss</td></tr>
<tr><td>Language</td><td>Auto-detect</td><td>Based on user browser</td></tr>
</tbody>
</table>

<h3>Configuration Example</h3>
<pre><code>{
  "company": {
    "name": "Acme Corp",
    "timezone": "America/New_York",
    "business_hours": {
      "start": "09:00",
      "end": "17:00",
      "days": ["monday", "tuesday", "wednesday", "thursday", "friday"]
    }
  }
}</code></pre>

<h2 id="team-management">2. Team Management</h2>
<p>Add your team members and configure their roles:</p>

<figure>
<img src="/images/team-setup.png" alt="Team management interface showing user roles and permissions" />
<figcaption>The team management interface allows you to control access and permissions</figcaption>
</figure>

<h3>User Roles</h3>
<dl>
<dt>Administrator</dt>
<dd>Full access to all features and settings. Can manage billing and users.</dd>

<dt>Agent</dt>
<dd>Can handle conversations, view reports, and manage their own settings.</dd>

<dt>Viewer</dt>
<dd>Read-only access to conversations and reports. Cannot respond to customers.</dd>
</dl>

<h2>Advanced Configuration</h2>
<details>
<summary>Click to expand advanced options</summary>
<p>These settings are for power users who need fine-grained control:</p>
<ul>
<li>Custom CSS for widget styling</li>
<li>Webhook configuration for external integrations</li>
<li>Advanced routing rules and automation</li>
</ul>
</details>

<hr>

<h2>Need Help?</h2>
<p>If you get stuck during setup, we're here to help:</p>
<ul>
<li>📧 Email: support@example.com</li>
<li>💬 Live chat: Available 24/7</li>
<li>📱 Phone: +1-555-0123</li>
</ul>`,

		"markdown_import": `
<h1>API Documentation</h1>
<p>This documentation covers the REST API endpoints for our platform.</p>
<h2>Authentication</h2>
<p>All API requests require authentication using an API key:</p>
<pre><code>curl -H "Authorization: Bearer YOUR_API_KEY" https://api.example.com/v1/users</code></pre>
<h2>Rate Limiting</h2>
<p>API requests are limited to 1000 requests per hour per API key.</p>
<h3>Rate Limit Headers</h3>
<ul>
<li><code>X-RateLimit-Limit</code>: The rate limit ceiling for your API key</li>
<li><code>X-RateLimit-Remaining</code>: The number of requests left for the time window</li>
<li><code>X-RateLimit-Reset</code>: The UTC date/time when the rate limit resets</li>
</ul>
<h2>Error Handling</h2>
<p>The API returns standard HTTP status codes:</p>
<ul>
<li><code>200</code> - Success</li>
<li><code>400</code> - Bad Request</li>
<li><code>401</code> - Unauthorized</li>
<li><code>404</code> - Not Found</li>
<li><code>500</code> - Internal Server Error</li>
</ul>
<h3>Error Response Format</h3>
<pre><code>{
  "error": {
    "code": "VALIDATION_ERROR",
    "message": "The email field is required.",
    "details": {
      "field": "email",
      "code": "required"
    }
  }
}</code></pre>
<hr>
<h2>Users Endpoint</h2>
<h3>List Users</h3>
<p><strong>GET</strong> <code>/v1/users</code></p>
<p>Returns a paginated list of users.</p>
<h4>Parameters</h4>
<ul>
<li><code>page</code> (integer, optional): Page number, defaults to 1</li>
<li><code>limit</code> (integer, optional): Items per page, defaults to 20, max 100</li>
<li><code>role</code> (string, optional): Filter by user role</li>
</ul>`,

		"interactive_transcript": `
<h1>Customer Onboarding Flow</h1>
<p>This interactive guide walks you through our customer onboarding process:</p>

<div data-type="callout" class="info">
<h3>Before You Start</h3>
<p>Make sure you have admin access to customize the onboarding flow.</p>
</div>

<h2>Step 1: Welcome Message</h2>
<p>Configure the first message customers see when they sign up:</p>

<div data-type="code-block" class="editable">
<pre><code>Welcome to [Company Name]!
We're excited to have you on board.
Let's get you set up in just a few minutes.</code></pre>
</div>

<div data-type="callout" class="warning">
<h4>Important Note</h4>
<p>Keep welcome messages short and friendly. Long text can overwhelm new users.</p>
</div>

<h2>Step 2: Data Collection</h2>
<p>Gather essential information from new customers:</p>

<div data-type="form-builder" class="interactive">
<h4>Required Fields:</h4>
<ul>
<li>Company name</li>
<li>Industry</li>
<li>Team size</li>
<li>Primary use case</li>
</ul>
</div>

<div data-type="tip" class="helpful">
<p><strong>Best Practice:</strong> Only ask for information you'll actually use. Each additional field reduces completion rates.</p>
</div>

<h2>Step 3: Feature Introduction</h2>
<p>Introduce key features through guided tours:</p>

<div data-type="checklist" class="interactive">
<h4>Tour Stops:</h4>
<ul>
<li>Dashboard overview</li>
<li>Creating first conversation</li>
<li>Setting up team members</li>
<li>Configuring notifications</li>
</ul>
</div>

<div data-type="callout" class="success">
<h4>Pro Tip</h4>
<p>Allow users to skip tours and return to them later. Not everyone learns the same way!</p>
</div>`,

		"giant_code_block": `
<h1>Complete Configuration File</h1>
<p>Below is the complete configuration file for our application. Copy this to your <code>config.toml</code> file:</p>

<pre><code># LibreDesk Configuration File
# This file contains all configuration options for the application

[app]
name = "LibreDesk"
version = "0.9.0"
environment = "production"
debug = false
log_level = "info"

[server]
host = "0.0.0.0"
port = 8080
read_timeout = "30s"
write_timeout = "30s"
idle_timeout = "120s"
max_header_bytes = 1048576

[database]
driver = "postgres"
host = "localhost"
port = 5432
name = "libredesk"
user = "postgres"
password = "your_password_here"
sslmode = "disable"
max_open_connections = 25
max_idle_connections = 5
connection_max_lifetime = "1h"

[redis]
host = "localhost"
port = 6379
password = ""
database = 0
max_retries = 3
pool_size = 10

[email]
driver = "smtp"
host = "smtp.gmail.com"
port = 587
username = "your_email@gmail.com"
password = "your_app_password"
from_address = "noreply@yourcompany.com"
from_name = "Your Company Support"

[storage]
driver = "local"
local_path = "./uploads"
max_file_size = "10MB"
allowed_extensions = ["jpg", "jpeg", "png", "gif", "pdf", "doc", "docx"]

[jwt]
secret = "your_super_secret_jwt_key_here"
expiry = "24h"
refresh_expiry = "168h"

[webhook]
queue_size = 1000
concurrency = 5
timeout = "10s"
retry_attempts = 3
retry_delay = "1s"

[ai]
provider = "openai"
api_key = "your_openai_api_key"
model = "gpt-4"
max_tokens = 1000
temperature = 0.7
system_prompt = "You are a helpful customer support assistant."

[embedding]
provider = "openai"
model = "text-embedding-ada-002"
dimensions = 1536
batch_size = 100

[search]
engine = "postgresql"
min_score = 0.5
max_results = 10
boost_title = 2.0
boost_content = 1.0

[monitoring]
enabled = true
metrics_endpoint = "/metrics"
health_endpoint = "/health"
profiler_enabled = false

[rate_limiting]
enabled = true
requests_per_minute = 60
burst_size = 100
cleanup_interval = "1m"

[cors]
allowed_origins = ["http://localhost:3000", "https://yourcompany.com"]
allowed_methods = ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
allowed_headers = ["Content-Type", "Authorization", "X-Requested-With"]
exposed_headers = ["X-Total-Count"]
allow_credentials = true
max_age = "12h"

[security]
bcrypt_cost = 12
session_timeout = "30m"
max_login_attempts = 5
lockout_duration = "15m"
require_https = true
csrf_protection = true

[notifications]
email_enabled = true
webhook_enabled = true
slack_enabled = false
discord_enabled = false

[limits]
max_conversations_per_contact = 1000
max_messages_per_conversation = 10000
max_attachments_per_message = 5
max_tags_per_conversation = 10
max_custom_attributes = 50</code></pre>

<p>After updating your configuration file, restart the application to apply the changes:</p>

<pre><code>sudo systemctl restart libredesk</code></pre>`,
	}
}

func TestRealWorldScenarios(t *testing.T) {
	samples := getRealWorldHTMLSamples()
	config := newTestConfig(150, 50, 15) // Use smaller limits to trigger chunking with word-based tokenizer

	testCases := []struct {
		name               string
		htmlKey            string
		expectedMinChunks  int
		expectedMaxChunks  int
		validationCallback func(*testing.T, []HTMLChunk, string)
	}{
		{
			name:              "API Reference Manual",
			htmlKey:           "api_reference",
			expectedMinChunks: 1,
			expectedMaxChunks: 8,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should have code blocks and tables properly chunked
				hasCodeChunk := false
				hasTableChunk := false
				for _, chunk := range chunks {
					if chunk.HasCode {
						hasCodeChunk = true
					}
					if chunk.HasTable {
						hasTableChunk = true
					}
				}
				assert.True(t, hasCodeChunk, "API reference should have at least one code chunk")
				assert.True(t, hasTableChunk, "API reference should have at least one table chunk")
			},
		},
		{
			name:              "Troubleshooting Guide",
			htmlKey:           "troubleshooting_guide",
			expectedMinChunks: 2,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should chunk well with nested lists and headings
				assert.True(t, len(chunks) >= 2, "Troubleshooting guide should split into multiple logical sections")
				// Check that token distribution is reasonable
				for i, chunk := range chunks {
					tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
					assert.True(t, tokens > 0, "Chunk %d should have content", i)
				}
			},
		},
		{
			name:              "WYSIWYG Nightmare",
			htmlKey:           "wysiwyg_nightmare",
			expectedMinChunks: 1,
			expectedMaxChunks: 4,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle messy HTML gracefully
				assert.True(t, len(chunks) >= 1, "WYSIWYG content should create at least one chunk")
				// Verify no chunks are empty after cleaning
				for i, chunk := range chunks {
					cleanText := HTML2Text(chunk.OriginalHTML)
					assert.NotEmpty(t, strings.TrimSpace(cleanText), "Chunk %d should not be empty after HTML cleanup", i)
				}
			},
		},
		{
			name:              "Legal Wall of Text",
			htmlKey:           "legal_wall_text",
			expectedMinChunks: 1,
			expectedMaxChunks: 3,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle very long paragraphs by splitting appropriately
				if len(chunks) > 1 {
					for i, chunk := range chunks {
						tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
						// No chunk should be excessively large (allow some tolerance)
						assert.True(t, tokens <= config.MaxTokens*2, "Chunk %d should not be excessively large (%d tokens)", i, tokens)
					}
				}
			},
		},
		{
			name:              "Feature Comparison Table",
			htmlKey:           "feature_comparison_table",
			expectedMinChunks: 1,
			expectedMaxChunks: 4,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should preserve table structure
				hasTable := false
				for _, chunk := range chunks {
					if chunk.HasTable {
						hasTable = true
						// Table chunk should contain table structure
						assert.Contains(t, chunk.OriginalHTML, "<table", "Table chunk should contain table tags")
					}
				}
				assert.True(t, hasTable, "Feature comparison should have table content")
			},
		},
		{
			name:              "Poorly Structured HTML",
			htmlKey:           "poorly_structured_html",
			expectedMinChunks: 1,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle malformed HTML structure gracefully
				assert.True(t, len(chunks) >= 1, "Should handle poorly structured HTML")
				// Verify chunker doesn't break on weird nesting
				for i, chunk := range chunks {
					assert.NotEmpty(t, chunk.OriginalHTML, "Chunk %d should have content", i)
				}
			},
		},
		{
			name:              "Minimalist Haiku",
			htmlKey:           "minimalist_haiku",
			expectedMinChunks: 1,
			expectedMaxChunks: 3,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should aggregate small sections appropriately
				totalTokens := 0
				for _, chunk := range chunks {
					tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
					totalTokens += tokens
				}
				// With many small sections, chunker should merge appropriately
				assert.True(t, totalTokens > 0, "Should have some content")
			},
		},
		{
			name:              "Release Notes",
			htmlKey:           "release_notes",
			expectedMinChunks: 2,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should chunk by version sections
				hasHeadings := false
				for _, chunk := range chunks {
					if chunk.HasHeading {
						hasHeadings = true
					}
				}
				assert.True(t, hasHeadings, "Release notes should maintain heading structure")
			},
		},
		{
			name:              "Image Heavy Guide",
			htmlKey:           "image_heavy_guide",
			expectedMinChunks: 2,
			expectedMaxChunks: 8,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should chunk around steps with images
				stepCount := 0
				for _, chunk := range chunks {
					if strings.Contains(chunk.OriginalHTML, "<h2>Step") {
						stepCount++
					}
				}
				assert.True(t, stepCount >= 1, "Should preserve step-based structure")
			},
		},
		{
			name:              "Nested Lists",
			htmlKey:           "nested_lists",
			expectedMinChunks: 2,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle deeply nested lists without breaking hierarchy
				for i, chunk := range chunks {
					// Check that nested content makes sense
					assert.NotEmpty(t, HTML2Text(chunk.OriginalHTML), "Chunk %d should have meaningful content", i)
				}
			},
		},
		{
			name:              "FAQ Description Lists",
			htmlKey:           "faq_description_lists",
			expectedMinChunks: 2,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle dl/dt/dd structure appropriately
				hasDescriptionList := false
				for _, chunk := range chunks {
					if strings.Contains(chunk.OriginalHTML, "<dl>") {
						hasDescriptionList = true
					}
				}
				assert.True(t, hasDescriptionList, "Should preserve description list structure")
			},
		},
		{
			name:              "Kitchen Sink",
			htmlKey:           "kitchen_sink",
			expectedMinChunks: 3,
			expectedMaxChunks: 10,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle all element types
				hasHeading := false
				hasCode := false
				hasTable := false
				for _, chunk := range chunks {
					if chunk.HasHeading {
						hasHeading = true
					}
					if chunk.HasCode {
						hasCode = true
					}
					if chunk.HasTable {
						hasTable = true
					}
				}
				assert.True(t, hasHeading, "Kitchen sink should have headings")
				assert.True(t, hasCode, "Kitchen sink should have code")
				assert.True(t, hasTable, "Kitchen sink should have tables")
			},
		},
		{
			name:              "Markdown Import",
			htmlKey:           "markdown_import",
			expectedMinChunks: 2,
			expectedMaxChunks: 8,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle clean markdown-generated HTML
				hasCode := false
				for _, chunk := range chunks {
					if chunk.HasCode {
						hasCode = true
					}
				}
				assert.True(t, hasCode, "Markdown import should preserve code blocks")
			},
		},
		{
			name:              "Interactive Transcript",
			htmlKey:           "interactive_transcript",
			expectedMinChunks: 2,
			expectedMaxChunks: 6,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should handle custom div elements with data attributes
				hasCustomDivs := false
				for _, chunk := range chunks {
					if strings.Contains(chunk.OriginalHTML, "data-type=") {
						hasCustomDivs = true
					}
				}
				assert.True(t, hasCustomDivs, "Should preserve custom interactive elements")
			},
		},
		{
			name:              "Giant Code Block",
			htmlKey:           "giant_code_block",
			expectedMinChunks: 1,
			expectedMaxChunks: 3,
			validationCallback: func(t *testing.T, chunks []HTMLChunk, scenario string) {
				// Should truncate oversized code blocks to respect max tokens
				hasCodeBlocks := false
				for _, chunk := range chunks {
					if chunk.HasCode {
						hasCodeBlocks = true
						tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
						// Each chunk should respect max token limit (150 in test config)
						assert.LessOrEqual(t, tokens, 150, "Code block chunk should not exceed max tokens after truncation")
						t.Logf("Code block chunk has %d tokens", tokens)
					}
				}
				assert.True(t, hasCodeBlocks, "Should have code blocks")
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			html, exists := samples[tc.htmlKey]
			require.True(t, exists, "Test HTML sample %s should exist", tc.htmlKey)

			chunks, err := ChunkHTMLContent(tc.name, html, config)
			require.NoError(t, err, "Chunking should not fail for %s", tc.name)

			// Basic validation
			assert.GreaterOrEqual(t, len(chunks), tc.expectedMinChunks,
				"Should have at least %d chunks for %s", tc.expectedMinChunks, tc.name)
			assert.LessOrEqual(t, len(chunks), tc.expectedMaxChunks,
				"Should have at most %d chunks for %s", tc.expectedMaxChunks, tc.name)

			// Verify chunk metadata
			for i, chunk := range chunks {
				assert.Equal(t, i, chunk.ChunkIndex, "Chunk index should be correct")
				assert.Equal(t, len(chunks), chunk.TotalChunks, "Total chunks should be correct")
				assert.NotEmpty(t, chunk.Text, "Chunk text should not be empty")
				assert.Contains(t, chunk.Text, tc.name, "Chunk should contain title")
			}

			// Token distribution validation
			totalTokens := 0
			for i, chunk := range chunks {
				tokens := simpleTokenizer(HTML2Text(chunk.OriginalHTML))
				totalTokens += tokens

				// Log token distribution for analysis
				t.Logf("Chunk %d: %d tokens", i, tokens)
			}

			// Scenario-specific validation
			if tc.validationCallback != nil {
				tc.validationCallback(t, chunks, tc.name)
			}

			t.Logf("Scenario '%s': %d chunks, %d total tokens", tc.name, len(chunks), totalTokens)
		})
	}
}

// TestChunkHTMLContent_ConfigurableTokenLimits tests that custom token limits work correctly
func TestChunkHTMLContent_ConfigurableTokenLimits(t *testing.T) {
	// Large HTML content with table and code
	largeHTML := `
		<h1>API Documentation</h1>
		<p>This is a comprehensive guide to our API endpoints with detailed examples.</p>
		<table>
			<tr><th>Endpoint</th><th>Method</th><th>Description</th><th>Parameters</th><th>Response</th></tr>
			<tr><td>/api/users</td><td>GET</td><td>Get all users</td><td>page, limit</td><td>JSON array of users</td></tr>
			<tr><td>/api/users/{id}</td><td>GET</td><td>Get user by ID</td><td>id (path)</td><td>JSON user object</td></tr>
			<tr><td>/api/users</td><td>POST</td><td>Create new user</td><td>name, email, role</td><td>Created user object</td></tr>
			<tr><td>/api/users/{id}</td><td>PUT</td><td>Update user</td><td>id (path), name, email, role</td><td>Updated user object</td></tr>
			<tr><td>/api/users/{id}</td><td>DELETE</td><td>Delete user</td><td>id (path)</td><td>Success message</td></tr>
		</table>
		<h2>Authentication</h2>
		<p>All API endpoints require authentication using JWT tokens in the Authorization header.</p>
		<pre><code>curl -H "Authorization: Bearer YOUR_TOKEN" -X GET https://api.example.com/users</code></pre>
		<p>Additional content to make this chunk larger and test the token limits effectively.</p>
	`

	// Test with default config (smaller chunks)
	defaultChunks, err := ChunkHTMLContent("API Guide", largeHTML)
	require.NoError(t, err)

	// Test with larger token config (should create fewer, larger chunks)
	largeConfig := ChunkConfig{
		MaxTokens:      2000, // Much larger than default 700
		MinTokens:      400,  // Larger than default 200
		OverlapTokens:  150,  // Larger than default 75
		TokenizerFunc:  simpleTokenizer,
		PreserveBlocks: []string{"pre", "code", "table"},
	}
	largeChunks, err := ChunkHTMLContent("API Guide", largeHTML, largeConfig)
	require.NoError(t, err)

	// Verify that larger config creates fewer chunks
	assert.True(t, len(largeChunks) <= len(defaultChunks),
		"Larger token config should create fewer or equal chunks. Default: %d, Large: %d",
		len(defaultChunks), len(largeChunks))

	// Verify that chunks contain expected metadata
	for _, chunk := range largeChunks {
		tokens, ok := chunk.Metadata["tokens"].(int)
		assert.True(t, ok, "Chunk should have token count in metadata")
		assert.True(t, tokens > 0, "Token count should be positive")

		// Check that we're respecting the larger config
		if tokens > 700 { // Old default limit
			t.Logf("✅ Large chunk with %d tokens (exceeds old 700 limit)", tokens)
		}
	}

	t.Logf("Default config: %d chunks, Large config: %d chunks", len(defaultChunks), len(largeChunks))
}