From df6f30824ad719a511ef4ae2e7f12c9e0b9ca6f1 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 14 Jan 2026 10:26:17 +1300
Subject: [PATCH 01/21] Improve exception handling across routes: log detailed
 errors server-side using `logger.exception`; return user-friendly error
 messages.

---
 webapp/routes/batch_routes.py              |  9 +++---
 webapp/routes/character_override_routes.py |  6 ++--
 webapp/routes/generation_routes.py         | 21 ++++++++++---
 webapp/routes/job_routes.py                | 10 +++++--
 webapp/routes/presets_routes.py            | 21 ++++++++-----
 webapp/routes/style_routes.py              | 34 ++++++++++++----------
 6 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/webapp/routes/batch_routes.py b/webapp/routes/batch_routes.py
index 3489ebd..f9929c8 100644
--- a/webapp/routes/batch_routes.py
+++ b/webapp/routes/batch_routes.py
@@ -9,7 +9,7 @@
 import json
 import uuid
 from typing import List, Tuple, Dict, Any
-from flask import Blueprint, jsonify, request, send_file, Response, stream_with_context
+from flask import Blueprint, jsonify, request, send_file, Response, stream_with_context, current_app
 from flask_login import login_required
 from werkzeug.utils import secure_filename
 
@@ -687,11 +687,10 @@ def batch_stream():
         else:
             return jsonify({"error": "File must be CSV or XLSX format"}), 400
 
-        print(f"DEBUG: File parsed successfully. Rows: {len(df)}, Columns: {list(df.columns)}")
+        current_app.logger.debug(f"File parsed successfully. Rows: {len(df)}, Columns: {list(df.columns)}")
     except Exception as e:
-        error_msg = f"Failed to read file: {e}"
-        print(f"ERROR: {error_msg}")
-        return jsonify({"error": error_msg}), 400
+        current_app.logger.exception('Failed to read uploaded batch file')
+        return jsonify({"error": "Failed to read file. Please ensure it's a valid CSV or XLSX format."}), 400
 
     # Get defaults from form, but filter out None/empty values
     defaults = {k: v for k, v in request.form.to_dict(flat=True).items() if v}
diff --git a/webapp/routes/character_override_routes.py b/webapp/routes/character_override_routes.py
index 3a811ab..e0a8c61 100644
--- a/webapp/routes/character_override_routes.py
+++ b/webapp/routes/character_override_routes.py
@@ -1,7 +1,7 @@
 """
 Admin routes for managing character override collections.
 """
-from flask import Blueprint, render_template, redirect, url_for, flash, request, jsonify
+from flask import Blueprint, render_template, redirect, url_for, flash, request, jsonify, current_app
 from flask_login import login_required, current_user
 from webapp.models import db, CharacterOverrideCollection, CharacterOverride
 from webapp.utils.auth_utils import admin_required, log_activity
@@ -454,8 +454,8 @@ def save_drawn_character(collection_id):
         return jsonify({'success': True, 'message': f'Character "{character}" saved successfully.'}), 200
 
     except Exception as e:
-        print(f"Error saving drawn character: {e}")
-        return jsonify({'error': str(e)}), 500
+        current_app.logger.exception('Error saving drawn character')
+        return jsonify({'error': 'Failed to save character'}), 500
 
 
 @character_override_bp.route('/character/<int:override_id>/delete', methods=['POST'])
diff --git a/webapp/routes/generation_routes.py b/webapp/routes/generation_routes.py
index fa581b2..86500b9 100644
--- a/webapp/routes/generation_routes.py
+++ b/webapp/routes/generation_routes.py
@@ -6,7 +6,7 @@
 import tempfile
 import time
 from typing import Any, Dict
-from flask import Blueprint, jsonify, request, Response
+from flask import Blueprint, jsonify, request, Response, current_app
 from flask_login import login_required
 
 # Ensure project root is in sys.path
@@ -107,8 +107,13 @@ def api_v1_generate():
         meta['generation_time_seconds'] = round(processing_time, 3)
 
         return jsonify({"svg": svg_text, "meta": meta})
-    except Exception as e:
+    except ValueError as e:
+        # ValueError is typically a validation error (invalid params), safe to show
+        current_app.logger.warning(f'Generation validation error: {e}')
         return jsonify({"error": str(e)}), 400
+    except Exception as e:
+        current_app.logger.exception('Generation error')
+        return jsonify({"error": "Failed to generate handwriting. Please check your parameters."}), 400
 
 
 @generation_bp.route("/api/v1/generate/svg", methods=["POST"])
@@ -146,8 +151,12 @@ def api_v1_generate_svg():
         log_activity('generate', f'Generated {lines_count} lines (SVG only)')
 
         return Response(svg_text, mimetype="image/svg+xml")
-    except Exception as e:
+    except ValueError as e:
+        current_app.logger.warning(f'Generation validation error: {e}')
         return jsonify({"error": str(e)}), 400
+    except Exception as e:
+        current_app.logger.exception('Generation error (SVG)')
+        return jsonify({"error": "Failed to generate handwriting. Please check your parameters."}), 400
 
 
 @generation_bp.route("/api/generate", methods=["POST"])
@@ -184,5 +193,9 @@ def generate_svg():
         log_activity('generate', f'Generated {lines_count} lines (legacy)')
 
         return Response(svg_text, mimetype="image/svg+xml")
-    except Exception as e:
+    except ValueError as e:
+        current_app.logger.warning(f'Generation validation error (legacy): {e}')
         return jsonify({"error": str(e)}), 400
+    except Exception as e:
+        current_app.logger.exception('Generation error (legacy)')
+        return jsonify({"error": "Failed to generate handwriting. Please check your parameters."}), 400
diff --git a/webapp/routes/job_routes.py b/webapp/routes/job_routes.py
index 99a0e34..3f6eccf 100644
--- a/webapp/routes/job_routes.py
+++ b/webapp/routes/job_routes.py
@@ -115,8 +115,10 @@ def list_jobs():
             }
         })
     except Exception as e:
-        current_app.logger.error(f'Error loading jobs: {str(e)}')
-        return jsonify({'error': 'Failed to load jobs', 'details': str(e)}), 500
+        # Log full exception details server-side for debugging
+        current_app.logger.exception('Error loading jobs')
+        # Return generic error message to user without internal details
+        return jsonify({'error': 'Failed to load jobs'}), 500
 
 
 @jobs_bp.route('/api/jobs', methods=['POST'])
@@ -401,5 +403,7 @@ def job_stats():
 
         return jsonify(stats)
     except Exception as e:
-        current_app.logger.error(f'Error loading job stats: {str(e)}')
+        # Log full exception details server-side (including stack trace)
+        current_app.logger.exception('Error loading job stats')
+        # Return generic error with default stats to keep UI functional
         return jsonify({'error': 'Failed to load stats', 'pending': 0, 'queued': 0, 'processing': 0, 'completed': 0, 'failed': 0, 'cancelled': 0, 'total': 0}), 500
diff --git a/webapp/routes/presets_routes.py b/webapp/routes/presets_routes.py
index acf228e..fcf0acd 100644
--- a/webapp/routes/presets_routes.py
+++ b/webapp/routes/presets_routes.py
@@ -1,7 +1,7 @@
 """
 API endpoints for page size and template presets.
 """
-from flask import Blueprint, jsonify, request
+from flask import Blueprint, jsonify, request, current_app
 from flask_login import login_required, current_user
 from webapp.models import PageSizePreset, TemplatePreset, db
 from webapp.utils.auth_utils import admin_required, log_activity
@@ -29,7 +29,8 @@ def list_page_sizes():
             'page_sizes': [ps.to_dict() for ps in page_sizes]
         })
     except Exception as e:
-        return jsonify({'page_sizes': [], 'error': str(e)}), 500
+        current_app.logger.exception('Error loading page sizes')
+        return jsonify({'page_sizes': [], 'error': 'Failed to load page sizes'}), 500
 
 
 @presets_bp.route('/api/templates', methods=['GET'])
@@ -50,7 +51,8 @@ def list_templates():
             'templates': [t.to_dict() for t in templates]
         })
     except Exception as e:
-        return jsonify({'templates': [], 'error': str(e)}), 500
+        current_app.logger.exception('Error loading templates')
+        return jsonify({'templates': [], 'error': 'Failed to load templates'}), 500
 
 
 @presets_bp.route('/api/templates/<int:template_id>', methods=['GET'])
@@ -72,7 +74,8 @@ def get_template(template_id):
             'template': template.to_dict()
         })
     except Exception as e:
-        return jsonify({'error': str(e)}), 404
+        current_app.logger.exception(f'Error loading template {template_id}')
+        return jsonify({'error': 'Template not found or error loading template'}), 404
 
 
 @presets_bp.route('/api/templates/<int:template_id>', methods=['PATCH'])
@@ -123,7 +126,8 @@ def update_template_status(template_id):
 
     except Exception as e:
         db.session.rollback()
-        return jsonify({'error': str(e)}), 500
+        current_app.logger.exception(f'Error updating template {template_id}')
+        return jsonify({'error': 'Failed to update template'}), 500
 
 
 @presets_bp.route('/api/templates', methods=['POST'])
@@ -223,7 +227,10 @@ def create_template_from_form():
         }), 201
 
     except ValueError as e:
-        return jsonify({'error': f'Invalid value: {str(e)}'}), 400
+        # ValueError is a validation error: log the detail, return a generic message
+        current_app.logger.warning(f'Invalid value when creating template: {e}')
+        return jsonify({'error': 'Invalid value provided'}), 400
     except Exception as e:
         db.session.rollback()
-        return jsonify({'error': str(e)}), 500
+        current_app.logger.exception('Error creating template')
+        return jsonify({'error': 'Failed to create template'}), 500
diff --git a/webapp/routes/style_routes.py b/webapp/routes/style_routes.py
index 9c8a9fb..cb5c31b 100644
--- a/webapp/routes/style_routes.py
+++ b/webapp/routes/style_routes.py
@@ -4,7 +4,7 @@
 import sys
 import re
 from typing import List, Dict, Any
-from flask import Blueprint, jsonify, send_file, Response
+from flask import Blueprint, jsonify, send_from_directory, Response, current_app
 from flask_login import login_required
 import numpy as np
 
@@ -120,7 +120,8 @@ def list_styles():
         return jsonify({"styles": styles})
 
     except Exception as e:
-        return jsonify({"styles": [], "error": str(e)}), 200
+        current_app.logger.exception('Error loading styles')
+        return jsonify({"styles": [], "error": "Failed to load styles"}), 200
 
 
 @style_bp.route("/api/style-preview/<int:style_id>", methods=["GET"])
@@ -136,26 +137,29 @@ def get_style_preview(style_id: int):
         SVG file content with 'image/svg+xml' mimetype, or a placeholder/error SVG if not found.
     """
     try:
-        # Validate style_id is a positive integer (Flask already validates it's an int)
+        # Validate style_id range (Flask already validates it's an int via <int:> route)
         if style_id < 0 or style_id > 999999:
             return Response(_placeholder_svg(style_id), mimetype='image/svg+xml')
 
-        # Construct safe filename - only digits allowed in style_id due to <int:> route
-        safe_filename = f"style-{style_id}.svg"
+        # Construct safe filename - style_id is guaranteed to be an integer by Flask route
+        # Using string formatting with validated integer prevents path traversal
+        safe_filename = f"style-{style_id:d}.svg"
 
-        # Build and normalize paths
-        base_path = os.path.normpath(os.path.abspath(STYLE_DIR))
-        file_path = os.path.normpath(os.path.join(base_path, safe_filename))
+        # Get the absolute base directory (constant, not user-controlled)
+        base_directory = os.path.abspath(STYLE_DIR)
 
-        # Verify path stays within base directory (defense in depth)
-        if not file_path.startswith(base_path + os.sep) and file_path != base_path:
+        # Check if the file exists before attempting to serve
+        file_path = os.path.join(base_directory, safe_filename)
+        if not os.path.isfile(file_path):
             return Response(_placeholder_svg(style_id), mimetype='image/svg+xml')
 
-        if os.path.isfile(file_path):
-            return send_file(file_path, mimetype='image/svg+xml')
-
-        # If no preview exists, return a placeholder SVG
-        return Response(_placeholder_svg(style_id), mimetype='image/svg+xml')
+        # Use send_from_directory for secure file serving
+        # This is Flask's safe way to serve files from a directory
+        return send_from_directory(
+            base_directory,
+            safe_filename,
+            mimetype='image/svg+xml'
+        )
 
     except Exception:
         # Return error placeholder

From 83db32c95793ef708f966b59d18749503a0e8905 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 14 Jan 2026 10:26:47 +1300
Subject: [PATCH 02/21] Add error logs management: view, search, filter,
 download, and clear logs from admin panel

---
 webapp/routes/admin_routes.py    | 207 ++++++++++++++++++++++++++++++-
 webapp/templates/admin/base.html |   6 +
 2 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/webapp/routes/admin_routes.py b/webapp/routes/admin_routes.py
index 4663033..a897353 100644
--- a/webapp/routes/admin_routes.py
+++ b/webapp/routes/admin_routes.py
@@ -1,7 +1,10 @@
 """
 Admin routes for user management and statistics.
 """
-from flask import Blueprint, render_template, redirect, url_for, flash, request, jsonify
+import os
+import re
+import glob
+from flask import Blueprint, render_template, redirect, url_for, flash, request, jsonify, current_app
 from flask_login import login_required, current_user
 from webapp.models import db, User, UserActivity, UsageStatistics, PageSizePreset, TemplatePreset
 from webapp.utils.auth_utils import admin_required, log_activity, get_user_statistics, get_user_activities, get_all_user_statistics
@@ -760,3 +763,205 @@ def delete_template(template_id):
     log_activity('admin_action', f'Deleted template preset: {name} (ID: {template_id})')
     flash(f'Template preset "{name}" deleted successfully.', 'success')
     return redirect(url_for('admin.templates'))
+
+
+# ============================================================================
+# Error Logs Management
+# ============================================================================
+
+def strip_ansi_codes(text):
+    """Return *text* with ANSI colour/style (``ESC[...m``) sequences removed."""
+    # Only sequences terminated by 'm' are matched; other escapes pass through.
+    return re.sub(r'\x1b\[[0-9;]*m', '', text)
+
+
+def parse_log_line(line):
+    """Classify one raw log line by severity.
+
+    Returns a dict with 'raw' (the line as given), 'clean' (ANSI codes
+    stripped) and 'level' ('error', 'warning', 'debug' or 'info').
+    Classification is a plain substring heuristic, so e.g. any line that
+    contains '500' anywhere is reported as an error.
+    """
+    stripped = strip_ansi_codes(line)
+    shouted = stripped.upper()
+
+    if 'ERROR' in shouted or '500' in stripped:
+        severity = 'error'
+    elif 'WARNING' in shouted or '400' in stripped or '404' in stripped:
+        severity = 'warning'
+    else:
+        severity = 'debug' if 'DEBUG' in shouted else 'info'
+    return {'raw': line, 'clean': stripped, 'level': severity}
+
+
+@admin_bp.route('/logs')
+@login_required
+@admin_required
+def error_logs():
+    """
+    View application log files with level and text filtering.
+
+    Query args: file, level, search, limit (max entries shown, default 500).
+    """
+    # Logs live in the `logs` directory one level above this module's directory
+    logs_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs')
+
+    # Get list of available log files
+    log_files = []
+    if os.path.exists(logs_dir):
+        for filename in sorted(os.listdir(logs_dir), reverse=True):
+            if filename.endswith(('.txt', '.log')):
+                filepath = os.path.join(logs_dir, filename)
+                stat = os.stat(filepath)
+                log_files.append({
+                    'name': filename,
+                    'size': stat.st_size,
+                    'modified': datetime.fromtimestamp(stat.st_mtime),
+                    'size_human': f"{stat.st_size / 1024:.1f} KB" if stat.st_size < 1024 * 1024 else f"{stat.st_size / (1024 * 1024):.1f} MB"
+                })
+
+    # Get selected log file (default: first name in reverse-sorted order)
+    selected_file = request.args.get('file', '')
+    if not selected_file and log_files:
+        selected_file = log_files[0]['name']
+
+    # Get filter parameters (limit is clamped to >= 1 so 0/negative is sane)
+    level_filter = request.args.get('level', 'all')
+    search_query = request.args.get('search', '').strip()
+    lines_limit = max(1, request.args.get('limit', 500, type=int))
+
+    # Read log file contents
+    log_entries = []
+    total_lines = 0
+    error_count = 0
+    warning_count = 0
+
+    if selected_file:
+        filepath = os.path.join(logs_dir, selected_file)
+        # Security check: resolved path must be strictly inside logs_dir
+        real_logs_dir = os.path.realpath(logs_dir)
+        real_filepath = os.path.realpath(filepath)
+        if not real_filepath.startswith(real_logs_dir + os.sep):
+            flash('Invalid log file path.', 'error')
+            return redirect(url_for('admin.error_logs'))
+
+        if os.path.isfile(filepath):
+            try:
+                with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+                    lines = f.readlines()
+
+                total_lines = len(lines)
+
+                # Process lines in reverse order (newest first)
+                for line in reversed(lines):
+                    if not line.strip():
+                        continue
+
+                    entry = parse_log_line(line)
+
+                    # Update counts
+                    if entry['level'] == 'error':
+                        error_count += 1
+                    elif entry['level'] == 'warning':
+                        warning_count += 1
+
+                    # Apply filters
+                    if level_filter != 'all' and entry['level'] != level_filter:
+                        continue
+
+                    if search_query and search_query.lower() not in entry['clean'].lower():
+                        continue
+
+                    # Cap what is displayed, but keep scanning so the
+                    # error/warning totals cover the whole file instead
+                    # of stopping at the display limit.
+                    if len(log_entries) < lines_limit:
+                        log_entries.append(entry)
+
+            except Exception:
+                current_app.logger.exception('Error reading log file: %s', selected_file)
+                flash('Error reading log file.', 'error')
+
+    log_activity('admin_action', f'Viewed error logs: {selected_file}')
+
+    return render_template('admin/logs.html',
+                           active_nav='logs',
+                           log_files=log_files,
+                           selected_file=selected_file,
+                           log_entries=log_entries,
+                           total_lines=total_lines,
+                           error_count=error_count,
+                           warning_count=warning_count,
+                           level_filter=level_filter,
+                           search_query=search_query,
+                           lines_limit=lines_limit)
+
+
+@admin_bp.route('/logs/download/<filename>')
+@login_required
+@admin_required
+def download_log(filename):
+    """
+    Send a file from the logs directory as an attachment.
+
+    Args:
+        filename: Name of the log file; must resolve inside the logs directory.
+    """
+    from flask import send_file
+
+    logs_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs')
+    filepath = os.path.join(logs_dir, filename)
+
+    # Security check: resolved path must be strictly inside logs_dir
+    real_logs_dir = os.path.realpath(logs_dir)
+    real_filepath = os.path.realpath(filepath)
+    if not real_filepath.startswith(real_logs_dir + os.sep):
+        flash('Invalid log file path.', 'error')
+        return redirect(url_for('admin.error_logs'))
+
+    if not os.path.isfile(filepath):
+        flash('Log file not found.', 'error')
+        return redirect(url_for('admin.error_logs'))
+
+    log_activity('admin_action', f'Downloaded log file: {filename}')
+
+    return send_file(filepath, as_attachment=True, download_name=filename)
+
+
+@admin_bp.route('/logs/clear/<filename>', methods=['POST'])
+@login_required
+@admin_required
+def clear_log(filename):
+    """
+    Clear (truncate) a log file, leaving a one-line audit marker in it.
+
+    Args:
+        filename: Name of the log file; must resolve inside the logs directory.
+    """
+    logs_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs')
+    filepath = os.path.join(logs_dir, filename)
+
+    # Security check: resolved path must be strictly inside logs_dir
+    real_logs_dir = os.path.realpath(logs_dir)
+    real_filepath = os.path.realpath(filepath)
+    if not real_filepath.startswith(real_logs_dir + os.sep):
+        flash('Invalid log file path.', 'error')
+        return redirect(url_for('admin.error_logs'))
+
+    if not os.path.isfile(filepath):
+        flash('Log file not found.', 'error')
+        return redirect(url_for('admin.error_logs'))
+
+    try:
+        # Truncate the file, recording who cleared it and when
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(f'# Log cleared by {current_user.username} at {datetime.now().isoformat()}\n')
+
+        log_activity('admin_action', f'Cleared log file: {filename}')
+        flash(f'Log file "{filename}" has been cleared.', 'success')
+    except Exception:
+        current_app.logger.exception('Error clearing log file: %s', filename)
+        flash('Error clearing log file.', 'error')
+
+    return redirect(url_for('admin.error_logs', file=filename))
diff --git a/webapp/templates/admin/base.html b/webapp/templates/admin/base.html
index aa41079..0287fdf 100644
--- a/webapp/templates/admin/base.html
+++ b/webapp/templates/admin/base.html
@@ -66,6 +66,12 @@ <h1>{% block admin_title %}Admin Dashboard{% endblock %}</h1>
                     Templates
                 </a>
             </li>
+            <li class="admin-nav__item">
+                <a href="{{ url_for('admin.error_logs') }}"
+                   class="admin-nav__link{% if active_nav == 'logs' %} admin-nav__link--active{% endif %}">
+                    Error Logs
+                </a>
+            </li>
         </ul>
     </nav>
 

From 4223a3d32a970180e9c1e9c5c9b1d42683da0dd1 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 14 Jan 2026 10:26:53 +1300
Subject: [PATCH 03/21] Set `FLASK_APP` environment variable in all
 Flask-Migrate commands; add `check` command for verifying Flask-Migrate
 installation and setup

---
 .claude/settings.local.json |  3 ++-
 deploy/db-migrate.sh        | 33 +++++++++++++++++++++++++--------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index a57520a..233aa41 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -29,7 +29,8 @@
       "Bash(find:*)",
       "Skill(frontend-design)",
       "Bash(FLASK_DEBUG=1 python:*)",
-      "Bash(docker compose:*)"
+      "Bash(docker compose:*)",
+      "Bash(wc:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/deploy/db-migrate.sh b/deploy/db-migrate.sh
index e16cd0f..dbbdae1 100644
--- a/deploy/db-migrate.sh
+++ b/deploy/db-migrate.sh
@@ -52,7 +52,7 @@ fi
 case $COMMAND in
     upgrade)
         log_info "Applying pending migrations..."
-        docker exec ${CONTAINER} flask db upgrade
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db upgrade
         log_info "Migrations applied successfully"
         ;;
 
@@ -60,7 +60,7 @@ case $COMMAND in
         log_warn "This will revert the last migration!"
         read -p "Are you sure? (y/N) " confirm
         if [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then
-            docker exec ${CONTAINER} flask db downgrade
+            docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db downgrade
             log_info "Migration reverted"
         else
             log_info "Cancelled"
@@ -69,36 +69,52 @@ case $COMMAND in
 
     current)
         log_info "Current migration version:"
-        docker exec ${CONTAINER} flask db current
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db current
         ;;
 
     history)
         log_info "Migration history:"
-        docker exec ${CONTAINER} flask db history
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db history
         ;;
 
     migrate)
         MESSAGE="${2:-Auto-generated migration}"
         log_info "Generating new migration: ${MESSAGE}"
-        docker exec ${CONTAINER} flask db migrate -m "${MESSAGE}"
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db migrate -m "${MESSAGE}"
         log_warn "Review the generated migration before applying!"
         log_info "Apply with: $0 upgrade"
         ;;
 
     heads)
         log_info "Current head revisions:"
-        docker exec ${CONTAINER} flask db heads
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db heads
         ;;
 
     init)
         log_info "Initializing migrations directory..."
-        docker exec ${CONTAINER} flask db init
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db init
         ;;
 
     stamp)
         REVISION="${2:-head}"
         log_info "Stamping database with revision: ${REVISION}"
-        docker exec ${CONTAINER} flask db stamp ${REVISION}
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask db stamp ${REVISION}
+        ;;
+
+    check)
+        log_info "Checking Flask-Migrate installation..."
+        echo ""
+        echo "1. Checking if Flask-Migrate is installed:"
+        docker exec ${CONTAINER} pip show flask-migrate 2>/dev/null || log_error "Flask-Migrate NOT installed!"
+        echo ""
+        echo "2. Checking available Flask commands:"
+        docker exec -e FLASK_APP=webapp.app:app ${CONTAINER} flask --help 2>&1 | grep -E "(db|Commands)" || true
+        echo ""
+        echo "3. Checking FLASK_APP environment:"
+        docker exec ${CONTAINER} printenv | grep FLASK || log_warn "FLASK_APP not in container env"
+        echo ""
+        echo "4. Checking migrations directory:"
+        docker exec ${CONTAINER} ls -la /app/migrations 2>/dev/null || log_error "Migrations directory not found!"
         ;;
 
     *)
@@ -113,6 +129,7 @@ case $COMMAND in
         echo "  heads       Show current head revisions"
         echo "  init        Initialize migrations (first time only)"
         echo "  stamp       Mark database at specific revision"
+        echo "  check       Verify Flask-Migrate installation and setup"
         echo ""
         echo "Options:"
         echo "  --production  Use production container"

From b9e903074e3be1b635534655441df759e5cca026 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 14 Jan 2026 10:27:11 +1300
Subject: [PATCH 04/21] Add error logs template: implement UI for viewing,
 filtering, searching, downloading, and clearing admin logs

---
 webapp/templates/admin/logs.html | 488 +++++++++++++++++++++++++++++++
 1 file changed, 488 insertions(+)
 create mode 100644 webapp/templates/admin/logs.html

diff --git a/webapp/templates/admin/logs.html b/webapp/templates/admin/logs.html
new file mode 100644
index 0000000..b34def7
--- /dev/null
+++ b/webapp/templates/admin/logs.html
@@ -0,0 +1,488 @@
+{% extends "admin/base.html" %}
+
+{% set active_nav = 'logs' %}
+
+{% block title %}Error Logs - Admin - WriteBot{% endblock %}
+
+{% block admin_title %}Error Logs{% endblock %}
+
+{% block admin_extra_css %}
+<style>
+    .logs-controls {
+        display: flex;
+        gap: 1rem;
+        flex-wrap: wrap;
+        margin-bottom: 1.5rem;
+        align-items: flex-end;
+    }
+
+    .logs-control-group {
+        display: flex;
+        flex-direction: column;
+        gap: 0.25rem;
+    }
+
+    .logs-control-group label {
+        font-size: 0.75rem;
+        color: #525252;
+        font-weight: 500;
+    }
+
+    .logs-control-group select,
+    .logs-control-group input {
+        padding: 0.5rem 0.75rem;
+        border: 1px solid #8d8d8d;
+        border-radius: 0;
+        font-size: 0.875rem;
+        background: #fff;
+        min-width: 150px;
+    }
+
+    .logs-control-group select:focus,
+    .logs-control-group input:focus {
+        outline: 2px solid #0f62fe;
+        outline-offset: -2px;
+        border-color: #0f62fe;
+    }
+
+    .logs-stats {
+        display: flex;
+        gap: 1.5rem;
+        margin-bottom: 1rem;
+        padding: 1rem;
+        background: #f4f4f4;
+        border-left: 3px solid #0f62fe;
+    }
+
+    .logs-stat {
+        display: flex;
+        flex-direction: column;
+    }
+
+    .logs-stat__value {
+        font-size: 1.5rem;
+        font-weight: 600;
+        line-height: 1;
+    }
+
+    .logs-stat__label {
+        font-size: 0.75rem;
+        color: #525252;
+        margin-top: 0.25rem;
+    }
+
+    .logs-stat--error .logs-stat__value {
+        color: #da1e28;
+    }
+
+    .logs-stat--warning .logs-stat__value {
+        color: #f1c21b;
+    }
+
+    .logs-file-selector {
+        display: flex;
+        gap: 0.5rem;
+        flex-wrap: wrap;
+        margin-bottom: 1rem;
+    }
+
+    .logs-file-btn {
+        padding: 0.5rem 1rem;
+        border: 1px solid #e0e0e0;
+        background: #fff;
+        font-size: 0.875rem;
+        cursor: pointer;
+        transition: all 0.15s;
+    }
+
+    .logs-file-btn:hover {
+        background: #f4f4f4;
+        border-color: #8d8d8d;
+    }
+
+    .logs-file-btn--active {
+        background: #0f62fe;
+        color: #fff;
+        border-color: #0f62fe;
+    }
+
+    .logs-file-btn--active:hover {
+        background: #0353e9;
+    }
+
+    .logs-viewer {
+        background: #161616;
+        border-radius: 4px;
+        overflow: hidden;
+        max-height: 600px;
+        overflow-y: auto;
+    }
+
+    .logs-viewer__header {
+        display: flex;
+        justify-content: space-between;
+        align-items: center;
+        padding: 0.75rem 1rem;
+        background: #262626;
+        border-bottom: 1px solid #393939;
+        position: sticky;
+        top: 0;
+        z-index: 10;
+    }
+
+    .logs-viewer__title {
+        color: #f4f4f4;
+        font-size: 0.875rem;
+        font-family: 'IBM Plex Mono', monospace;
+    }
+
+    .logs-viewer__actions {
+        display: flex;
+        gap: 0.5rem;
+    }
+
+    .logs-viewer__action-btn {
+        padding: 0.25rem 0.5rem;
+        background: #393939;
+        color: #f4f4f4;
+        border: none;
+        font-size: 0.75rem;
+        cursor: pointer;
+        border-radius: 2px;
+        text-decoration: none;
+        display: inline-flex;
+        align-items: center;
+        gap: 0.25rem;
+    }
+
+    .logs-viewer__action-btn:hover {
+        background: #525252;
+    }
+
+    .logs-viewer__action-btn--danger {
+        background: #da1e28;
+    }
+
+    .logs-viewer__action-btn--danger:hover {
+        background: #b81921;
+    }
+
+    .logs-content {
+        padding: 1rem;
+        font-family: 'IBM Plex Mono', monospace;
+        font-size: 0.8125rem;
+        line-height: 1.6;
+    }
+
+    .log-entry {
+        padding: 0.25rem 0;
+        border-bottom: 1px solid #262626;
+        color: #c6c6c6;
+        word-break: break-word;
+    }
+
+    .log-entry:last-child {
+        border-bottom: none;
+    }
+
+    .log-entry--error {
+        color: #ff8389;
+        background: rgba(218, 30, 40, 0.1);
+        padding: 0.5rem;
+        margin: 0.25rem 0;
+        border-left: 3px solid #da1e28;
+    }
+
+    .log-entry--warning {
+        color: #f1c21b;
+        background: rgba(241, 194, 27, 0.05);
+        padding: 0.5rem;
+        margin: 0.25rem 0;
+        border-left: 3px solid #f1c21b;
+    }
+
+    .log-entry--debug {
+        color: #6f6f6f;
+    }
+
+    .log-entry__timestamp {
+        color: #6f6f6f;
+    }
+
+    .empty-logs {
+        padding: 3rem;
+        text-align: center;
+        color: #6f6f6f;
+    }
+
+    .log-file-info {
+        display: flex;
+        align-items: center;
+        gap: 1rem;
+        padding: 0.75rem 1rem;
+        background: #262626;
+        margin-bottom: 1rem;
+        border-radius: 4px;
+    }
+
+    .log-file-info__name {
+        color: #f4f4f4;
+        font-weight: 500;
+        font-family: 'IBM Plex Mono', monospace;
+    }
+
+    .log-file-info__meta {
+        color: #8d8d8d;
+        font-size: 0.75rem;
+    }
+
+    .badge-level {
+        display: inline-block;
+        padding: 0.125rem 0.5rem;
+        font-size: 0.75rem;
+        font-weight: 500;
+        border-radius: 2px;
+    }
+
+    .badge-level--error {
+        background: #da1e28;
+        color: #fff;
+    }
+
+    .badge-level--warning {
+        background: #f1c21b;
+        color: #161616;
+    }
+
+    .badge-level--info {
+        background: #0f62fe;
+        color: #fff;
+    }
+
+    .badge-level--debug {
+        background: #6f6f6f;
+        color: #fff;
+    }
+
+    /* Confirm dialog */
+    .confirm-dialog {
+        display: none;
+        position: fixed;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        background: rgba(0,0,0,0.5);
+        z-index: 1000;
+        justify-content: center;
+        align-items: center;
+    }
+
+    .confirm-dialog.active {
+        display: flex;
+    }
+
+    .confirm-dialog__content {
+        background: #fff;
+        padding: 1.5rem;
+        border-radius: 4px;
+        max-width: 400px;
+        box-shadow: 0 4px 20px rgba(0,0,0,0.2);
+    }
+
+    .confirm-dialog__title {
+        font-size: 1.125rem;
+        font-weight: 600;
+        margin-bottom: 0.5rem;
+    }
+
+    .confirm-dialog__message {
+        color: #525252;
+        margin-bottom: 1.5rem;
+    }
+
+    .confirm-dialog__actions {
+        display: flex;
+        gap: 0.5rem;
+        justify-content: flex-end;
+    }
+
+    @media (max-width: 768px) {
+        .logs-controls {
+            flex-direction: column;
+        }
+
+        .logs-control-group {
+            width: 100%;
+        }
+
+        .logs-control-group select,
+        .logs-control-group input {
+            width: 100%;
+        }
+
+        .logs-stats {
+            flex-wrap: wrap;
+        }
+    }
+</style>
+{% endblock %}
+
+{% block admin_content %}
+<div class="admin-section">
+    <div class="admin-section__header">
+        <div>
+            <h2 class="admin-section__title">Application Logs</h2>
+            <p class="admin-section__subtitle">View and manage application error and debug logs</p>
+        </div>
+    </div>
+
+    {% if log_files %}
+    <!-- Log File Selector -->
+    <div class="logs-file-selector">
+        {% for file in log_files[:10] %}
+        <a href="{{ url_for('admin.error_logs', file=file.name, level=level_filter, search=search_query) }}"
+           class="logs-file-btn{% if file.name == selected_file %} logs-file-btn--active{% endif %}">
+            {{ file.name | replace('log_', '') | replace('.txt', '') }}
+            <small>({{ file.size_human }})</small>
+        </a>
+        {% endfor %}
+    </div>
+
+    <!-- Stats Overview -->
+    <div class="logs-stats">
+        <div class="logs-stat">
+            <span class="logs-stat__value">{{ total_lines }}</span>
+            <span class="logs-stat__label">Total Lines</span>
+        </div>
+        <div class="logs-stat logs-stat--error">
+            <span class="logs-stat__value">{{ error_count }}</span>
+            <span class="logs-stat__label">Errors</span>
+        </div>
+        <div class="logs-stat logs-stat--warning">
+            <span class="logs-stat__value">{{ warning_count }}</span>
+            <span class="logs-stat__label">Warnings</span>
+        </div>
+        <div class="logs-stat">
+            <span class="logs-stat__value">{{ log_entries | length }}</span>
+            <span class="logs-stat__label">Showing</span>
+        </div>
+    </div>
+
+    <!-- Filter Controls -->
+    <form method="GET" action="{{ url_for('admin.error_logs') }}" class="logs-controls">
+        <input type="hidden" name="file" value="{{ selected_file }}">
+
+        <div class="logs-control-group">
+            <label for="level">Log Level</label>
+            <select name="level" id="level" onchange="this.form.submit()">
+                <option value="all" {% if level_filter == 'all' %}selected{% endif %}>All Levels</option>
+                <option value="error" {% if level_filter == 'error' %}selected{% endif %}>Errors Only</option>
+                <option value="warning" {% if level_filter == 'warning' %}selected{% endif %}>Warnings Only</option>
+                <option value="info" {% if level_filter == 'info' %}selected{% endif %}>Info Only</option>
+                <option value="debug" {% if level_filter == 'debug' %}selected{% endif %}>Debug Only</option>
+            </select>
+        </div>
+
+        <div class="logs-control-group">
+            <label for="search">Search</label>
+            <input type="text" name="search" id="search" value="{{ search_query }}" placeholder="Search logs...">
+        </div>
+
+        <div class="logs-control-group">
+            <label for="limit">Lines Limit</label>
+            <select name="limit" id="limit" onchange="this.form.submit()">
+                <option value="100" {% if lines_limit == 100 %}selected{% endif %}>100</option>
+                <option value="500" {% if lines_limit == 500 %}selected{% endif %}>500</option>
+                <option value="1000" {% if lines_limit == 1000 %}selected{% endif %}>1000</option>
+                <option value="5000" {% if lines_limit == 5000 %}selected{% endif %}>5000</option>
+            </select>
+        </div>
+
+        <div class="logs-control-group">
+            <label>&nbsp;</label>
+            <button type="submit" class="btn btn-primary" style="padding: 0.5rem 1rem;">Apply Filter</button>
+        </div>
+    </form>
+
+    <!-- Log Viewer -->
+    <div class="logs-viewer">
+        <div class="logs-viewer__header">
+            <span class="logs-viewer__title">{{ selected_file }}</span>
+            <div class="logs-viewer__actions">
+                <a href="{{ url_for('admin.download_log', filename=selected_file) }}" class="logs-viewer__action-btn">
+                    Download
+                </a>
+                <button type="button" class="logs-viewer__action-btn logs-viewer__action-btn--danger" onclick="showClearConfirm()">
+                    Clear Log
+                </button>
+            </div>
+        </div>
+
+        <div class="logs-content">
+            {% if log_entries %}
+                {% for entry in log_entries %}
+                <div class="log-entry log-entry--{{ entry.level }}">{{ entry.clean }}</div>
+                {% endfor %}
+            {% else %}
+            <div class="empty-logs">
+                {% if search_query or level_filter != 'all' %}
+                No log entries match your filter criteria.
+                {% else %}
+                No log entries found in this file.
+                {% endif %}
+            </div>
+            {% endif %}
+        </div>
+    </div>
+
+    {% else %}
+    <div class="empty-state">
+        <div class="empty-state__message">No log files found in the logs directory.</div>
+    </div>
+    {% endif %}
+</div>
+
+<!-- Clear Confirmation Dialog -->
+<div id="clearConfirmDialog" class="confirm-dialog">
+    <div class="confirm-dialog__content">
+        <div class="confirm-dialog__title">Clear Log File?</div>
+        <div class="confirm-dialog__message">
+            This will permanently delete all entries in <strong>{{ selected_file }}</strong>.
+            This action cannot be undone.
+        </div>
+        <div class="confirm-dialog__actions">
+            <button type="button" class="btn btn-ghost" onclick="hideClearConfirm()">Cancel</button>
+            <form method="POST" action="{{ url_for('admin.clear_log', filename=selected_file) }}" style="display: inline;">
+                <button type="submit" class="btn btn-danger">Clear Log</button>
+            </form>
+        </div>
+    </div>
+</div>
+{% endblock %}
+
+{% block admin_extra_js %}
+<script>
+    function showClearConfirm() {
+        document.getElementById('clearConfirmDialog').classList.add('active');
+    }
+
+    function hideClearConfirm() {
+        document.getElementById('clearConfirmDialog').classList.remove('active');
+    }
+
+    // Close dialog on outside click
+    document.getElementById('clearConfirmDialog').addEventListener('click', function(e) {
+        if (e.target === this) {
+            hideClearConfirm();
+        }
+    });
+
+    // Close dialog on Escape key
+    document.addEventListener('keydown', function(e) {
+        if (e.key === 'Escape') {
+            hideClearConfirm();
+        }
+    });
+</script>
+{% endblock %}

From a8a38b5c32f784bcd4caf62c31caac987eb1ddab Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 15:02:52 +1300
Subject: [PATCH 05/21] Update stroke width UI in admin template and adjust SVG
 stroke handling for consistency with generated text

---
 handwriting_synthesis/hand/_draw.py                  | 12 ++++++------
 webapp/templates/admin/character_overrides/view.html |  9 ++++++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 28b91c0..1f7a0ae 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -553,19 +553,19 @@ def _draw(
                                 continue
 
                             orig_stroke = elem.get('stroke', 'none')
-                            orig_stroke_width = elem.get('stroke-width', '3')
 
                             path = dwg.path(d=d)
 
                             if orig_stroke and orig_stroke.lower() not in ('none', 'transparent'):
-                                try:
-                                    stroke_width = min(float(orig_stroke_width), 4.0)
-                                except:
-                                    stroke_width = 2.0
+                                # Use line-level stroke width for consistency with generated text
+                                # Compensate for transform scaling to maintain visual thickness
+                                line_stroke_width = segment['width']
+                                avg_scale = (scale_x + scale_y) / 2.0
+                                adjusted_stroke_width = line_stroke_width / avg_scale if avg_scale > 0 else line_stroke_width
 
                                 path = path.stroke(
                                     color=segment['color'],
-                                    width=stroke_width,
+                                    width=adjusted_stroke_width,
                                     linecap='round',
                                     linejoin='round'
                                 ).fill('none')
diff --git a/webapp/templates/admin/character_overrides/view.html b/webapp/templates/admin/character_overrides/view.html
index 264c119..b8c3631 100644
--- a/webapp/templates/admin/character_overrides/view.html
+++ b/webapp/templates/admin/character_overrides/view.html
@@ -75,9 +75,12 @@ <h3>Upload Character Variants</h3>
                     <label for="draw-baseline-offset">Baseline Offset</label>
                     <input type="number" id="draw-baseline-offset" value="0" step="0.1">
                 </div>
-                <div class="form-group" style="max-width: 140px;">
-                    <label for="draw-stroke-width">Stroke Width</label>
-                    <input type="number" id="draw-stroke-width" value="1" min="1" max="10" step="0.5">
+                <div class="form-group" style="max-width: 160px;">
+                    <label for="draw-stroke-width">Stroke Width (Preview)</label>
+                    <input type="number" id="draw-stroke-width" value="2" min="1" max="5" step="0.5">
+                    <small style="display: block; color: #6f6f6f; margin-top: 4px; font-size: 11px;">
+                        Preview only. Final width uses generation settings.
+                    </small>
                 </div>
                 <div class="button-group" style="margin-top: 24px;">
                     <button type="button" id="clear-canvas" class="btn btn-secondary">Clear</button>

From e6c9df87b197ce432cc776a92ecd2c422881907c Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 15:41:51 +1300
Subject: [PATCH 06/21] Refine override width estimation: calculate target
 height dynamically and adjust scaling logic for consistency with generated
 text layout

---
 .claude/settings.local.json         |  3 ++-
 handwriting_synthesis/hand/Hand.py  | 11 +++++++++--
 handwriting_synthesis/hand/_draw.py | 12 +++++++-----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 233aa41..f5cefa4 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -30,7 +30,8 @@
       "Skill(frontend-design)",
       "Bash(FLASK_DEBUG=1 python:*)",
       "Bash(docker compose:*)",
-      "Bash(wc:*)"
+      "Bash(wc:*)",
+      "Bash(grep:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 9f24134..1c2473b 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -471,14 +471,21 @@ def write_chunked(
                 current_line_width = 0.0
                 current_line_segment_list = []
 
+                # Calculate actual target height for override width estimation
+                # This must match the calculation in _draw.py for consistent layout
+                from handwriting_synthesis.hand._draw import _to_px
+                default_line_height_px = 60.0
+                line_height_px = _to_px(line_height, units) if line_height is not None else default_line_height_px
+                target_h_for_estimation = 0.95 * line_height_px
+
                 for seg_idx, segment in enumerate(line_segments_data):
                     if segment['type'] == 'override':
                         # Estimate override width for layout
                         from handwriting_synthesis.hand.character_override_utils import get_random_override, estimate_override_width
                         override_data = get_random_override(overrides_dict, segment['text'])
                         if override_data:
-                            # Estimate width (using typical line height of 60px)
-                            override_width = estimate_override_width(override_data, target_height=60, x_stretch=1.0)
+                            # Estimate width using actual target height and x_stretch
+                            override_width = estimate_override_width(override_data, target_height=target_h_for_estimation, x_stretch=x_stretch)
                         else:
                             override_width = 20  # fallback width
 
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 1f7a0ae..c8eb5bd 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -371,7 +371,8 @@ def _draw(
                     ls_temp[:, 0] *= x_stretch
                 total_line_width += ls_temp[:, 0].max()
             elif segment.get('type') == 'override':
-                override_width = segment['estimated_width']
+                # Apply s_global to match generated text scaling
+                override_width = segment['estimated_width'] * s_global
 
                 # Check if there's a space before this override character
                 has_space_before = False
@@ -496,12 +497,13 @@ def _draw(
 
                     # Calculate scale to match generated text height
                     # Generated text: normalized to start at y=0, height=raw_h, then scaled by s_global
-                    # Final height = raw_h * s_global ≈ target_h
-                    # SVG character should have same final height: char_height * scale = target_h
+                    # Final height = raw_h * s_global (which may be < target_h when width-constrained)
+                    # Override should match: char_height * scale = target_h * s_global
+                    effective_target_h = target_h * s_global
                     if char_height > 0:
-                        scale = target_h / char_height
+                        scale = effective_target_h / char_height
                     else:
-                        scale = 1.0
+                        scale = s_global
 
                     scale_x = scale * x_stretch
                     scale_y = scale

From 112028d6c8654b72170cd595092e538ec955a4e6 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 17:40:35 +1300
Subject: [PATCH 07/21] Refine override width and scaling logic: dynamically
 compute effective target height, improve consistency with generated text, and
 add debug logs for preprocessing and overrides

---
 handwriting_synthesis/hand/Hand.py  | 11 ++-------
 handwriting_synthesis/hand/_draw.py | 36 +++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 1c2473b..9f24134 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -471,21 +471,14 @@ def write_chunked(
                 current_line_width = 0.0
                 current_line_segment_list = []
 
-                # Calculate actual target height for override width estimation
-                # This must match the calculation in _draw.py for consistent layout
-                from handwriting_synthesis.hand._draw import _to_px
-                default_line_height_px = 60.0
-                line_height_px = _to_px(line_height, units) if line_height is not None else default_line_height_px
-                target_h_for_estimation = 0.95 * line_height_px
-
                 for seg_idx, segment in enumerate(line_segments_data):
                     if segment['type'] == 'override':
                         # Estimate override width for layout
                         from handwriting_synthesis.hand.character_override_utils import get_random_override, estimate_override_width
                         override_data = get_random_override(overrides_dict, segment['text'])
                         if override_data:
-                            # Estimate width using actual target height and x_stretch
-                            override_width = estimate_override_width(override_data, target_height=target_h_for_estimation, x_stretch=x_stretch)
+                            # Estimate width (using typical line height of 60px)
+                            override_width = estimate_override_width(override_data, target_height=60, x_stretch=1.0)
                         else:
                             override_width = 20  # fallback width
 
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index c8eb5bd..eda8a17 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -219,6 +219,7 @@ def _draw(
     # First pass: preprocess each line and compute per-line max allowed scale
     preprocessed_lines = []
     scale_limits = []
+    raw_heights = []  # Track raw heights for computing average
     target_h = 0.95 * line_height_px
 
     for line_idx, segment_list in enumerate(line_segments):
@@ -279,6 +280,11 @@ def _draw(
                 s_w = content_width_px / raw_w
                 s_h = target_h / raw_h
                 scale_limits.append(min(s_w, s_h))
+                raw_heights.append(raw_h)  # Track for average calculation
+
+                # DEBUG: Log preprocessing values
+                print(f"DEBUG preprocess: text='{segment.get('text', '')[:20]}', raw_h={raw_h:.2f}, s_h={s_h:.4f}, s_w={s_w:.4f}")
+
                 preprocessed_segments.append({
                     'type': 'generated',
                     'strokes': ls,
@@ -295,6 +301,15 @@ def _draw(
     else:
         s_global = float(manual_size_scale)
 
+    # Compute effective target height for overrides based on actual generated text height
+    # This ensures overrides match the size of surrounding generated text
+    avg_raw_h = sum(raw_heights) / len(raw_heights) if raw_heights else target_h
+    effective_target_h = avg_raw_h * s_global
+
+    # DEBUG: Log key scaling values
+    has_overrides = bool(overrides_dict)
+    print(f"DEBUG _draw: overrides={'ENABLED' if has_overrides else 'DISABLED'}, target_h={target_h:.2f}, s_global={s_global:.4f}, avg_raw_h={avg_raw_h:.2f}, effective_target_h={effective_target_h:.2f}")
+
     # BUGFIX: For small pages where auto_size significantly reduces text scale,
     # adjust line height to be proportional to the actual rendered text size.
     # This prevents huge line spacing when text is scaled down to fit narrow pages.
@@ -371,8 +386,8 @@ def _draw(
                     ls_temp[:, 0] *= x_stretch
                 total_line_width += ls_temp[:, 0].max()
             elif segment.get('type') == 'override':
-                # Apply s_global to match generated text scaling
-                override_width = segment['estimated_width'] * s_global
+                # Rescale the layout estimate to effective_target_h. NOTE(review): Hand.py estimates at a fixed 60px height, not target_h - confirm this ratio
+                override_width = segment['estimated_width'] * (effective_target_h / target_h)
 
                 # Check if there's a space before this override character
                 has_space_before = False
@@ -437,6 +452,7 @@ def _draw(
         for seg_idx, segment in enumerate(preprocessed_segments):
             if segment.get('type') == 'generated':
                 ls = segment['strokes'].copy()
+                raw_h_before_scale = ls[:, 1].max()
                 ls[:, :2] *= s_global
                 if x_stretch != 1.0:
                     ls[:, 0] *= x_stretch
@@ -447,6 +463,10 @@ def _draw(
 
                 # Track segment width before translating
                 segment_width = ls[:, 0].max()
+                segment_height = ls[:, 1].max()
+
+                # DEBUG: Log generated segment dimensions
+                print(f"DEBUG generated: text='{segment.get('text', '')[:20]}', raw_h={raw_h_before_scale:.2f}, final_h={segment_height:.2f}")
 
                 ls[:, 0] += cursor_x
                 ls[:, 1] += line_offset_y
@@ -496,14 +516,13 @@ def _draw(
                     char_height = char_max_y - char_min_y
 
                     # Calculate scale to match generated text height
-                    # Generated text: normalized to start at y=0, height=raw_h, then scaled by s_global
-                    # Final height = raw_h * s_global (which may be < target_h when width-constrained)
-                    # Override should match: char_height * scale = target_h * s_global
-                    effective_target_h = target_h * s_global
+                    # Generated text renders at: raw_h * s_global (NOT target_h!)
+                    # Override should match: char_height * scale = effective_target_h
+                    # where effective_target_h = avg_raw_h * s_global
                     if char_height > 0:
                         scale = effective_target_h / char_height
                     else:
-                        scale = s_global
+                        scale = 1.0
 
                     scale_x = scale * x_stretch
                     scale_y = scale
@@ -516,6 +535,9 @@ def _draw(
                     rendered_width = char_width * scale_x
                     rendered_height = char_height * scale_y
 
+                    # DEBUG: Log override dimensions
+                    print(f"DEBUG override: char='{segment.get('char', '?')}', char_h={char_height:.2f}, scale={scale:.4f}, final_h={rendered_height:.2f}, effective_target_h={effective_target_h:.2f}")
+
                     # Check if there's a space before this override character
                     has_space_before = False
                     if seg_idx > 0:

From bdd439f039d77d064706b4819315f4b40e88439b Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 21:41:10 +1300
Subject: [PATCH 08/21] Implement placeholder-based override handling: preserve
 RNN context, simplify stroke generation, and dynamically render overrides in
 SVG gaps.

---
 handwriting_synthesis/hand/Hand.py  | 100 ++++----
 handwriting_synthesis/hand/_draw.py | 373 +++++++++++++++++++++++++---
 2 files changed, 380 insertions(+), 93 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 9f24134..629c6c4 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -176,70 +176,56 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
         stroke_colors = _normalize_seq(stroke_colors, num_lines, str, 'stroke_colors')
         stroke_widths = _normalize_seq(stroke_widths, num_lines, float, 'stroke_widths')
 
-        # Split lines with character overrides
+        # Handle character overrides using SPACE PLACEHOLDER approach
+        # Key insight: Generate full lines with SPACES where overrides go.
+        # The space creates a natural gap in the stroke sequence (pen lift).
+        # We then insert the override SVG into that gap - no stroke clipping needed!
+        # This preserves full RNN context for the surrounding text.
         if overrides_dict:
-            print(f"DEBUG: Processing text with overrides enabled")
-            from handwriting_synthesis.hand.character_override_utils import split_text_with_overrides
+            print(f"DEBUG: Processing text with SPACE-PLACEHOLDER override approach")
 
-            # Create expanded line data with override info
-            line_segments = []
-            texts_to_generate = []
-            segment_to_line_idx = []
+            # Use SPACE as placeholder - creates natural gap in strokes
+            placeholder_char = ' '
 
-            for line_idx, line in enumerate(lines):
-                print(f"DEBUG: Processing line {line_idx}: '{line}'")
-                chunks = split_text_with_overrides(line, overrides_dict)
-                print(f"DEBUG:   Split into {len(chunks)} chunks: {chunks}")
-                line_segment_list = []
+            # Track override positions: {line_idx: [(char_idx, original_char), ...]}
+            override_positions = {}
+            modified_lines = []
 
-                for chunk_text, is_override in chunks:
-                    print(f"DEBUG:     Chunk: '{chunk_text}', is_override={is_override}")
-                    if is_override:
-                        line_segment_list.append({
-                            'type': 'override',
-                            'text': chunk_text,
-                            'line_idx': line_idx
-                        })
+            for line_idx, line in enumerate(lines):
+                override_positions[line_idx] = []
+                modified_line_chars = []
+
+                for char_idx, char in enumerate(line):
+                    if char in overrides_dict:
+                        # Track the position and original character
+                        override_positions[line_idx].append((char_idx, char))
+                        # Replace with SPACE - creates natural gap for override insertion
+                        modified_line_chars.append(placeholder_char)
+                        print(f"DEBUG: Line {line_idx}, char {char_idx}: replacing '{char}' with SPACE placeholder")
                     else:
-                        if chunk_text.strip():  # Only generate non-empty chunks
-                            gen_idx = len(texts_to_generate)
-                            texts_to_generate.append(chunk_text)
-                            segment_to_line_idx.append(line_idx)
-                            line_segment_list.append({
-                                'type': 'generated',
-                                'gen_idx': gen_idx,
-                                'text': chunk_text,
-                                'line_idx': line_idx
-                            })
-                        else:
-                            # Empty space, generate it
-                            gen_idx = len(texts_to_generate)
-                            texts_to_generate.append(chunk_text)
-                            segment_to_line_idx.append(line_idx)
-                            line_segment_list.append({
-                                'type': 'generated',
-                                'gen_idx': gen_idx,
-                                'text': chunk_text,
-                                'line_idx': line_idx
-                            })
+                        modified_line_chars.append(char)
 
-                line_segments.append(line_segment_list)
+                modified_lines.append(''.join(modified_line_chars))
+
+            print(f"DEBUG: Original lines: {lines}")
+            print(f"DEBUG: Modified lines (with placeholders): {modified_lines}")
+            print(f"DEBUG: Override positions: {override_positions}")
 
-            print(f"DEBUG: Texts to generate: {texts_to_generate}")
-
-            # Generate strokes for non-override chunks
-            if texts_to_generate:
-                gen_biases = [biases[idx] if biases else None for idx in segment_to_line_idx]
-                gen_styles = [styles[idx] if styles else None for idx in segment_to_line_idx]
-                generated_strokes = self._sample(texts_to_generate, biases=gen_biases, styles=gen_styles)
-            else:
-                generated_strokes = []
-
-            # Map generated strokes back to segments
-            for line_segment_list in line_segments:
-                for segment in line_segment_list:
-                    if segment['type'] == 'generated':
-                        segment['strokes'] = generated_strokes[segment['gen_idx']]
+            # Generate strokes for FULL lines (like non-override path)
+            # This preserves RNN context - the key improvement!
+            generated_strokes = self._sample(modified_lines, biases=biases, styles=styles)
+
+            # Convert to line_segments format (single segment per line, like non-override)
+            line_segments = []
+            for line_idx, (original_line, strokes) in enumerate(zip(lines, generated_strokes)):
+                line_segments.append([{
+                    'type': 'generated',
+                    'text': original_line,  # Keep original text for reference
+                    'modified_text': modified_lines[line_idx],  # Text that was actually generated
+                    'strokes': strokes,
+                    'line_idx': line_idx,
+                    'override_positions': override_positions[line_idx]  # [(char_idx, char), ...]
+                }])
         else:
             # No overrides, use normal generation
             print(f"DEBUG: No overrides, using normal generation")
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index eda8a17..e296524 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -95,6 +95,210 @@ def _resolve_page_size(page_size, units, num_lines, default_line_height_px):
     return width_px, height_px, svg_size
 
 
+def _compute_inter_segment_spacing(prev_segment, current_segment, reference_height):
+    """
+    Compute spacing to add before current_segment based on the previous segment.
+
+    Args:
+        prev_segment: The previous segment dict (or None if first segment)
+        current_segment: The current segment dict
+        reference_height: Height to use for computing proportional spacing
+
+    Returns:
+        Spacing amount in pixels
+    """
+    if prev_segment is None:
+        return 0.0
+
+    current_type = current_segment.get('type')
+    prev_type = prev_segment.get('type')
+
+    if current_type == 'generated' and prev_type == 'generated':
+        # Generated-to-generated: add spacing based on text boundaries
+        prev_text = prev_segment.get('text', '')
+        current_text = current_segment.get('text', '')
+        has_space = prev_text.endswith(' ') or current_text.startswith(' ')
+        return reference_height * 0.35 if has_space else reference_height * 0.1
+
+    # Override spacing is handled separately in override rendering
+    return 0.0
+
+
+def _render_strokes_with_overrides(
+    dwg, ls, original_text, override_positions, overrides_dict,
+    cursor_x, line_offset_y, s_global, x_stretch, line_scale_x,
+    color, width, target_h
+):
+    """
+    Render generated strokes with override SVGs inserted into natural gaps.
+
+    SPACE PLACEHOLDER APPROACH:
+    The text was generated with SPACES where override characters should be.
+    Spaces create natural gaps in the stroke sequence (pen lifts).
+    We render ALL strokes (they already have gaps), then insert override SVGs
+    into those gaps at calculated positions.
+
+    This is nearly identical to non-override rendering, just with override
+    SVGs added at the right positions.
+
+    Args:
+        dwg: SVG drawing object
+        ls: Stroke coordinates array (already scaled)
+        original_text: Original text of the line (with override chars)
+        override_positions: List of (char_idx, char) tuples for override positions
+        overrides_dict: Dictionary of override character data
+        cursor_x: Starting X position
+        line_offset_y: Y position for this line
+        s_global: Global scale factor
+        x_stretch: Horizontal stretch factor
+        line_scale_x: Line-specific horizontal scale (for overflow prevention)
+        color: Stroke color
+        width: Stroke width
+        target_h: Target height for scaling overrides
+
+    Returns:
+        Final cursor_x position after rendering
+    """
+    from handwriting_synthesis.hand.character_override_utils import get_random_override
+
+    if ls.shape[0] == 0:
+        return cursor_x
+
+    # Calculate dimensions
+    stroke_min_x = ls[:, 0].min()
+    stroke_max_x = ls[:, 0].max()
+    total_stroke_width = stroke_max_x - stroke_min_x
+    stroke_height = ls[:, 1].max()
+    num_chars = len(original_text) if original_text else 1
+    avg_char_width = total_stroke_width / max(1, num_chars)
+
+    print(f"DEBUG render_with_overrides: text='{original_text}', num_chars={num_chars}, total_w={total_stroke_width:.2f}, avg_char_w={avg_char_width:.2f}")
+
+    # Sort override positions by character index
+    sorted_overrides = sorted(override_positions, key=lambda x: x[0])
+
+    # STEP 1: Render ALL strokes exactly like the non-override path
+    # The spaces already created natural gaps - we just render everything
+    ls_render = ls.copy()
+    ls_render[:, 0] += cursor_x - stroke_min_x  # Shift to cursor_x
+    ls_render[:, 1] += line_offset_y
+
+    prev_eos = 1.0
+    commands = []
+    for x, y, eos in zip(*ls_render.T):
+        commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+        prev_eos = eos
+
+    if commands:
+        p = ' '.join(commands)
+        path = svgwrite.path.Path(p)
+        path = path.stroke(color=color, width=width, linecap='round', linejoin='round', miterlimit=2).fill('none')
+        dwg.add(path)
+
+    # STEP 2: Insert override SVGs at calculated positions (filling the space gaps)
+    for char_idx, override_char in sorted_overrides:
+        # Calculate where this character should be positioned
+        # The space placeholder created a gap here - we fill it with the override
+        char_start_x = cursor_x + (char_idx * avg_char_width)
+
+        print(f"DEBUG override at char_idx={char_idx}, char='{override_char}', calculated_x={char_start_x:.2f}")
+
+        # Get override data
+        override_data = get_random_override(overrides_dict, override_char)
+        if not override_data:
+            print(f"Warning: No override data for '{override_char}'")
+            continue
+
+        # Parse override SVG to get dimensions
+        try:
+            svg_root = ET.fromstring(override_data['svg_data'])
+            all_x_coords = []
+            all_y_coords = []
+
+            for elem in svg_root.iter():
+                tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
+                if tag_name == 'path':
+                    d = elem.get('d', '')
+                    coords = re.findall(r'[ML]\s*([-\d.]+)\s+([-\d.]+)', d)
+                    for x, y in coords:
+                        all_x_coords.append(float(x))
+                        all_y_coords.append(float(y))
+
+            if not all_x_coords or not all_y_coords:
+                print(f"Warning: No coordinates found for override '{override_char}'")
+                continue
+
+            char_min_x = min(all_x_coords)
+            char_max_x = max(all_x_coords)
+            char_min_y = min(all_y_coords)
+            char_max_y = max(all_y_coords)
+
+            char_width = char_max_x - char_min_x
+            char_height = char_max_y - char_min_y
+
+            # Calculate scale to match stroke height
+            if char_height > 0:
+                scale = stroke_height / char_height
+            else:
+                scale = 1.0
+
+            scale_x = scale * x_stretch * line_scale_x
+            scale_y = scale
+
+            # Rendered dimensions
+            rendered_width = char_width * scale_x
+
+            print(f"DEBUG override render: char='{override_char}', char_h={char_height:.2f}, scale={scale:.4f}, rendered_w={rendered_width:.2f}, gap_w={avg_char_width:.2f}")
+
+            # Center the override in the space gap
+            # Gap is avg_char_width wide, override is rendered_width wide
+            gap_center_x = char_start_x + (avg_char_width / 2.0)
+            override_start_x = gap_center_x - (rendered_width / 2.0)
+
+            # Position override SVG
+            pos_x = override_start_x - (char_min_x * scale_x)
+            pos_y = line_offset_y - (char_min_y * scale_y)
+
+            # Create group with transform
+            g = dwg.g(transform=f"translate({pos_x},{pos_y}) scale({scale_x},{scale_y})")
+
+            # Add paths from override SVG
+            for elem in svg_root.iter():
+                tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
+                if tag_name == 'path':
+                    d = elem.get('d')
+                    if not d:
+                        continue
+
+                    orig_stroke = elem.get('stroke', 'none')
+                    path = dwg.path(d=d)
+
+                    if orig_stroke and orig_stroke.lower() not in ('none', 'transparent'):
+                        avg_scale = (scale_x + scale_y) / 2.0
+                        adjusted_stroke_width = width / avg_scale if avg_scale > 0 else width
+                        path = path.stroke(
+                            color=color,
+                            width=adjusted_stroke_width,
+                            linecap='round',
+                            linejoin='round'
+                        ).fill('none')
+                    else:
+                        path = path.fill(color)
+
+                    g.add(path)
+
+            dwg.add(g)
+
+        except Exception as e:
+            print(f"Error rendering override '{override_char}': {e}")
+            import traceback
+            traceback.print_exc()
+
+    # Return final X position (same as total stroke width since we rendered everything)
+    final_x = cursor_x + total_stroke_width
+    return final_x
+
+
 def _draw(
     line_segments,  # Changed from 'strokes' to 'line_segments'
     lines,
@@ -288,9 +492,11 @@ def _draw(
                 preprocessed_segments.append({
                     'type': 'generated',
                     'strokes': ls,
+                    'raw_h': raw_h,  # Store for adjacent override sizing
                     'color': color,
                     'width': width,
-                    'text': segment.get('text', '')  # Add original text for spacing checks
+                    'text': segment.get('text', ''),  # Add original text for spacing checks
+                    'override_positions': segment.get('override_positions', [])  # Preserve override positions for placeholder approach
                 })
 
         preprocessed_lines.append(preprocessed_segments if preprocessed_segments else [{'empty': True}])
@@ -384,10 +590,52 @@ def _draw(
                 ls_temp[:, :2] *= s_global
                 if x_stretch != 1.0:
                     ls_temp[:, 0] *= x_stretch
-                total_line_width += ls_temp[:, 0].max()
+                segment_height = ls_temp[:, 1].max()
+                segment_width = ls_temp[:, 0].max()
+
+                # Add inter-segment spacing
+                prev_seg = preprocessed_segments[seg_idx - 1] if seg_idx > 0 else None
+                spacing = _compute_inter_segment_spacing(prev_seg, segment, segment_height)
+                total_line_width += spacing + segment_width
+
+                # For placeholder approach: adjust width for override character size differences
+                override_positions = segment.get('override_positions', [])
+                if override_positions and overrides_dict:
+                    from handwriting_synthesis.hand.character_override_utils import get_random_override, estimate_override_width
+                    original_text = segment.get('text', '')
+                    num_chars = len(original_text) if original_text else 1
+                    avg_char_width = segment_width / max(1, num_chars)
+
+                    # Calculate width adjustment for each override
+                    for char_idx, override_char in override_positions:
+                        override_data = get_random_override(overrides_dict, override_char)
+                        if override_data:
+                            # Estimate override width at current scale
+                            override_width = estimate_override_width(override_data, segment_height, x_stretch)
+                            # Width difference: override width minus placeholder width
+                            width_diff = override_width - avg_char_width
+                            total_line_width += width_diff
+                            print(f"DEBUG width calc: override '{override_char}' width_diff={width_diff:.2f}")
+
             elif segment.get('type') == 'override':
-                # Scale estimated width to match effective_target_h (was estimated with target_h)
-                override_width = segment['estimated_width'] * (effective_target_h / target_h)
+                # Scale estimated width using ADJACENT segment heights (same as rendering)
+                adjacent_raw_heights = []
+                if seg_idx > 0:
+                    prev_seg = preprocessed_segments[seg_idx - 1]
+                    if prev_seg.get('type') == 'generated' and 'raw_h' in prev_seg:
+                        adjacent_raw_heights.append(prev_seg['raw_h'])
+                if seg_idx < len(preprocessed_segments) - 1:
+                    next_seg = preprocessed_segments[seg_idx + 1]
+                    if next_seg.get('type') == 'generated' and 'raw_h' in next_seg:
+                        adjacent_raw_heights.append(next_seg['raw_h'])
+
+                if adjacent_raw_heights:
+                    local_raw_h = sum(adjacent_raw_heights) / len(adjacent_raw_heights)
+                    local_effective_target_h = local_raw_h * s_global
+                else:
+                    local_effective_target_h = effective_target_h
+
+                override_width = segment['estimated_width'] * (local_effective_target_h / target_h)
 
                 # Check if there's a space before this override character
                 has_space_before = False
@@ -451,38 +699,76 @@ def _draw(
         cursor_x = line_offset_x
         for seg_idx, segment in enumerate(preprocessed_segments):
             if segment.get('type') == 'generated':
-                ls = segment['strokes'].copy()
-                raw_h_before_scale = ls[:, 1].max()
-                ls[:, :2] *= s_global
-                if x_stretch != 1.0:
-                    ls[:, 0] *= x_stretch
+                # Check if this segment uses the placeholder-based override approach
+                override_positions = segment.get('override_positions', [])
+
+                if override_positions and overrides_dict:
+                    # NEW PLACEHOLDER APPROACH: Use unified rendering with override insertion
+                    print(f"DEBUG: Using placeholder-based rendering for segment with {len(override_positions)} overrides")
+
+                    ls = segment['strokes'].copy()
+                    ls[:, :2] *= s_global
+                    if x_stretch != 1.0:
+                        ls[:, 0] *= x_stretch
+                    if line_scale_x != 1.0:
+                        ls[:, 0] *= line_scale_x
+
+                    segment_height = ls[:, 1].max()
+
+                    cursor_x = _render_strokes_with_overrides(
+                        dwg=dwg,
+                        ls=ls,
+                        original_text=segment.get('text', ''),
+                        override_positions=override_positions,
+                        overrides_dict=overrides_dict,
+                        cursor_x=cursor_x,
+                        line_offset_y=line_offset_y,
+                        s_global=s_global,
+                        x_stretch=x_stretch,
+                        line_scale_x=line_scale_x,
+                        color=segment['color'],
+                        width=segment['width'],
+                        target_h=segment_height
+                    )
+                else:
+                    # STANDARD PATH: No overrides, render normally
+                    ls = segment['strokes'].copy()
+                    raw_h_before_scale = ls[:, 1].max()
+                    ls[:, :2] *= s_global
+                    if x_stretch != 1.0:
+                        ls[:, 0] *= x_stretch
 
-                # Apply line-specific horizontal scaling to prevent overflow
-                if line_scale_x != 1.0:
-                    ls[:, 0] *= line_scale_x
+                    # Apply line-specific horizontal scaling to prevent overflow
+                    if line_scale_x != 1.0:
+                        ls[:, 0] *= line_scale_x
 
-                # Track segment width before translating
-                segment_width = ls[:, 0].max()
-                segment_height = ls[:, 1].max()
+                    # Track segment width before translating
+                    segment_width = ls[:, 0].max()
+                    segment_height = ls[:, 1].max()
 
-                # DEBUG: Log generated segment dimensions
-                print(f"DEBUG generated: text='{segment.get('text', '')[:20]}', raw_h={raw_h_before_scale:.2f}, final_h={segment_height:.2f}")
+                    # Add inter-segment spacing
+                    prev_seg = preprocessed_segments[seg_idx - 1] if seg_idx > 0 else None
+                    spacing = _compute_inter_segment_spacing(prev_seg, segment, segment_height)
+                    cursor_x += spacing
 
-                ls[:, 0] += cursor_x
-                ls[:, 1] += line_offset_y
+                    # DEBUG: Log generated segment dimensions
+                    print(f"DEBUG generated: text='{segment.get('text', '')[:20]}', raw_h={raw_h_before_scale:.2f}, final_h={segment_height:.2f}")
 
-                prev_eos = 1.0
-                commands = []
-                for x, y, eos in zip(*ls.T):
-                    commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-                    prev_eos = eos
-                p = ' '.join(commands)
-                path = svgwrite.path.Path(p)
-                path = path.stroke(color=segment['color'], width=segment['width'], linecap='round', linejoin='round', miterlimit=2).fill('none')
-                dwg.add(path)
+                    ls[:, 0] += cursor_x
+                    ls[:, 1] += line_offset_y
 
-                # Advance cursor by segment width
-                cursor_x += segment_width
+                    prev_eos = 1.0
+                    commands = []
+                    for x, y, eos in zip(*ls.T):
+                        commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+                        prev_eos = eos
+                    p = ' '.join(commands)
+                    path = svgwrite.path.Path(p)
+                    path = path.stroke(color=segment['color'], width=segment['width'], linecap='round', linejoin='round', miterlimit=2).fill('none')
+                    dwg.add(path)
+
+                    # Advance cursor by segment width
+                    cursor_x += segment_width
 
             elif segment.get('type') == 'override':
                 override_data = segment['override_data']
@@ -515,12 +801,27 @@ def _draw(
                     char_width = char_max_x - char_min_x
                     char_height = char_max_y - char_min_y
 
-                    # Calculate scale to match generated text height
-                    # Generated text renders at: raw_h * s_global (NOT target_h!)
-                    # Override should match: char_height * scale = effective_target_h
-                    # where effective_target_h = avg_raw_h * s_global
+                    # Calculate scale to match ADJACENT generated text height
+                    # Use raw_h from neighboring segments for better local matching
+                    adjacent_raw_heights = []
+                    if seg_idx > 0:
+                        prev_seg = preprocessed_segments[seg_idx - 1]
+                        if prev_seg.get('type') == 'generated' and 'raw_h' in prev_seg:
+                            adjacent_raw_heights.append(prev_seg['raw_h'])
+                    if seg_idx < len(preprocessed_segments) - 1:
+                        next_seg = preprocessed_segments[seg_idx + 1]
+                        if next_seg.get('type') == 'generated' and 'raw_h' in next_seg:
+                            adjacent_raw_heights.append(next_seg['raw_h'])
+
+                    # Use adjacent average if available, otherwise fall back to global
+                    if adjacent_raw_heights:
+                        local_raw_h = sum(adjacent_raw_heights) / len(adjacent_raw_heights)
+                        local_effective_target_h = local_raw_h * s_global
+                    else:
+                        local_effective_target_h = effective_target_h
+
                     if char_height > 0:
-                        scale = effective_target_h / char_height
+                        scale = local_effective_target_h / char_height
                     else:
                         scale = 1.0
 
@@ -536,7 +837,7 @@ def _draw(
                     rendered_height = char_height * scale_y
 
                     # DEBUG: Log override dimensions
-                    print(f"DEBUG override: char='{segment.get('char', '?')}', char_h={char_height:.2f}, scale={scale:.4f}, final_h={rendered_height:.2f}, effective_target_h={effective_target_h:.2f}")
+                    print(f"DEBUG override: char='{segment.get('char', '?')}', char_h={char_height:.2f}, scale={scale:.4f}, final_h={rendered_height:.2f}, local_target_h={local_effective_target_h:.2f}, adjacent_raw_h={adjacent_raw_heights}")
 
                     # Check if there's a space before this override character
                     has_space_before = False

From 02af4ea2479280d9f972ddbab4c9190a136f474b Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 21:45:09 +1300
Subject: [PATCH 09/21] Refactor override rendering: implement clipping for
 placeholder strokes, ensure clean SVG insertion, and improve debug logs for
 exclusion zones and rendering steps.

---
 handwriting_synthesis/hand/_draw.py | 73 ++++++++++++++++-------------
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index e296524..91f1ce7 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -130,16 +130,17 @@ def _render_strokes_with_overrides(
     color, width, target_h
 ):
     """
-    Render generated strokes with override SVGs inserted into natural gaps.
+    Render generated strokes with override SVGs inserted at calculated positions.
 
-    SPACE PLACEHOLDER APPROACH:
+    SPACE PLACEHOLDER + CLIPPING APPROACH:
     The text was generated with SPACES where override characters should be.
-    Spaces create natural gaps in the stroke sequence (pen lifts).
-    We render ALL strokes (they already have gaps), then insert override SVGs
-    into those gaps at calculated positions.
+    We CLIP OUT any stroke points that fall within the override character zones,
+    then insert override SVGs at those positions.
 
-    This is nearly identical to non-override rendering, just with override
-    SVGs added at the right positions.
+    This ensures:
+    1. Full RNN context for surrounding text (space is a valid character)
+    2. No artifacts from placeholder strokes (we clip them out)
+    3. Clean override insertion
 
     Args:
         dwg: SVG drawing object
@@ -177,8 +178,16 @@ def _render_strokes_with_overrides(
     # Sort override positions by character index
     sorted_overrides = sorted(override_positions, key=lambda x: x[0])
 
-    # STEP 1: Render ALL strokes exactly like the non-override path
-    # The spaces already created natural gaps - we just render everything
+    # Build exclusion zones (X ranges to clip out) for each override position
+    exclusion_zones = []
+    for char_idx, override_char in sorted_overrides:
+        # Zone where placeholder strokes should be clipped
+        zone_start = char_idx * avg_char_width
+        zone_end = (char_idx + 1) * avg_char_width
+        exclusion_zones.append((zone_start, zone_end))
+        print(f"DEBUG exclusion zone for '{override_char}': char_idx={char_idx}, zone=[{zone_start:.2f}, {zone_end:.2f}]")
+
+    # STEP 1: Render strokes, CLIPPING OUT points within exclusion zones
     ls_render = ls.copy()
     ls_render[:, 0] += cursor_x - stroke_min_x  # Shift to cursor_x
     ls_render[:, 1] += line_offset_y
@@ -186,8 +195,22 @@ def _render_strokes_with_overrides(
     prev_eos = 1.0
     commands = []
     for x, y, eos in zip(*ls_render.T):
-        commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-        prev_eos = eos
+        # Calculate relative X position from stroke start
+        rel_x = x - cursor_x
+
+        # Check if this point is within any exclusion zone
+        in_exclusion = False
+        for zone_start, zone_end in exclusion_zones:
+            if zone_start <= rel_x <= zone_end:
+                in_exclusion = True
+                break
+
+        if in_exclusion:
+            # Skip this point, mark as stroke break so next point starts a new path
+            prev_eos = 1.0
+        else:
+            commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+            prev_eos = eos
 
     if commands:
         p = ' '.join(commands)
@@ -195,7 +218,7 @@ def _render_strokes_with_overrides(
         path = path.stroke(color=color, width=width, linecap='round', linejoin='round', miterlimit=2).fill('none')
         dwg.add(path)
 
-    # STEP 2: Insert override SVGs at calculated positions (filling the space gaps)
+    # STEP 2: Insert override SVGs at calculated positions
     for char_idx, override_char in sorted_overrides:
         # Calculate where this character should be positioned
         # The space placeholder created a gap here - we fill it with the override
@@ -598,24 +621,9 @@ def _draw(
                 spacing = _compute_inter_segment_spacing(prev_seg, segment, segment_height)
                 total_line_width += spacing + segment_width
 
-                # For placeholder approach: adjust width for override character size differences
-                override_positions = segment.get('override_positions', [])
-                if override_positions and overrides_dict:
-                    from handwriting_synthesis.hand.character_override_utils import get_random_override, estimate_override_width
-                    original_text = segment.get('text', '')
-                    num_chars = len(original_text) if original_text else 1
-                    avg_char_width = segment_width / max(1, num_chars)
-
-                    # Calculate width adjustment for each override
-                    for char_idx, override_char in override_positions:
-                        override_data = get_random_override(overrides_dict, override_char)
-                        if override_data:
-                            # Estimate override width at current scale
-                            override_width = estimate_override_width(override_data, segment_height, x_stretch)
-                            # Width difference: override width minus placeholder width
-                            width_diff = override_width - avg_char_width
-                            total_line_width += width_diff
-                            print(f"DEBUG width calc: override '{override_char}' width_diff={width_diff:.2f}")
+                # SPACE PLACEHOLDER APPROACH: No width adjustment needed
+                # The strokes already have natural gaps where spaces are, and we just fill them.
+                # The total width is the stroke width as-is.
 
             elif segment.get('type') == 'override':
                 # Scale estimated width using ADJACENT segment heights (same as rendering)
@@ -703,8 +711,9 @@ def _draw(
                 override_positions = segment.get('override_positions', [])
 
                 if override_positions and overrides_dict:
-                    # NEW PLACEHOLDER APPROACH: Use unified rendering with override insertion
-                    print(f"DEBUG: Using placeholder-based rendering for segment with {len(override_positions)} overrides")
+                    # SPACE PLACEHOLDER APPROACH: Render strokes normally (spaces create gaps),
+                    # then fill those gaps with override SVGs
+                    print(f"DEBUG: Using SPACE PLACEHOLDER rendering for segment with {len(override_positions)} overrides")
 
                     ls = segment['strokes'].copy()
                     ls[:, :2] *= s_global

From 611dd9df2c574cb6c10ed8145eac0b58fcac19b6 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Tue, 20 Jan 2026 22:27:29 +1300
Subject: [PATCH 10/21] Enhance stroke generation: integrate attention-based
 character indices for precise override positioning, implement model-level
 cutting, and refine rendering logic for transitions.

---
 handwriting_synthesis/hand/Hand.py            |  31 +-
 handwriting_synthesis/hand/_draw.py           | 366 +++++++++++++-----
 .../hand/operations/sampling.py               |  62 ++-
 handwriting_synthesis/rnn/RNN.py              |  37 +-
 4 files changed, 375 insertions(+), 121 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 629c6c4..eaa7133 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -211,18 +211,27 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
             print(f"DEBUG: Modified lines (with placeholders): {modified_lines}")
             print(f"DEBUG: Override positions: {override_positions}")
 
-            # Generate strokes for FULL lines (like non-override path)
-            # This preserves RNN context - the key improvement!
-            generated_strokes = self._sample(modified_lines, biases=biases, styles=styles)
+            # Generate strokes for FULL lines with CHAR INDICES from attention
+            # This gives us precise knowledge of where each character was written!
+            generated_strokes, char_indices_list = self._sample(
+                modified_lines, biases=biases, styles=styles, return_char_indices=True
+            )
+
+            print(f"DEBUG: Got char_indices for {len(char_indices_list)} lines")
+            for i, ci in enumerate(char_indices_list):
+                print(f"DEBUG:   Line {i}: {len(ci)} char indices, range [{ci.min() if len(ci) > 0 else 'N/A'}, {ci.max() if len(ci) > 0 else 'N/A'}]")
 
             # Convert to line_segments format (single segment per line, like non-override)
             line_segments = []
-            for line_idx, (original_line, strokes) in enumerate(zip(lines, generated_strokes)):
+            for line_idx, (original_line, strokes, char_indices) in enumerate(
+                zip(lines, generated_strokes, char_indices_list)
+            ):
                 line_segments.append([{
                     'type': 'generated',
                     'text': original_line,  # Keep original text for reference
                     'modified_text': modified_lines[line_idx],  # Text that was actually generated
                     'strokes': strokes,
+                    'char_indices': char_indices,  # NEW: Character index per stroke from attention
                     'line_idx': line_idx,
                     'override_positions': override_positions[line_idx]  # [(char_idx, char), ...]
                 }])
@@ -271,7 +280,7 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
             margin_jitter_coherence=margin_jitter_coherence,
         )
 
-    def _sample(self, lines, biases=None, styles=None):
+    def _sample(self, lines, biases=None, styles=None, return_char_indices=False):
         """
         Sample stroke sequences from the RNN.
 
@@ -279,11 +288,19 @@ def _sample(self, lines, biases=None, styles=None):
             lines: List of text lines
             biases: Optional biases
             styles: Optional styles
+            return_char_indices: If True, also return character indices per stroke
+                                (from the attention mechanism)
 
         Returns:
-            List of stroke sequences
+            If return_char_indices is False:
+                List of stroke sequences
+            If return_char_indices is True:
+                Tuple of (strokes_list, char_indices_list)
         """
-        return sample_strokes(self.nn.session, self.nn, lines, biases, styles)
+        return sample_strokes(
+            self.nn.session, self.nn, lines, biases, styles,
+            return_char_indices=return_char_indices
+        )
 
     def write_chunked(
         self,
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 91f1ce7..c1761af 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -127,20 +127,22 @@ def _compute_inter_segment_spacing(prev_segment, current_segment, reference_heig
 def _render_strokes_with_overrides(
     dwg, ls, original_text, override_positions, overrides_dict,
     cursor_x, line_offset_y, s_global, x_stretch, line_scale_x,
-    color, width, target_h
+    color, width, target_h, char_indices=None
 ):
     """
-    Render generated strokes with override SVGs inserted at calculated positions.
+    Render generated strokes with override SVGs inserted at precise positions.
 
-    SPACE PLACEHOLDER + CLIPPING APPROACH:
+    MODEL-LEVEL CHAR INDEX APPROACH WITH GAP CREATION:
     The text was generated with SPACES where override characters should be.
-    We CLIP OUT any stroke points that fall within the override character zones,
-    then insert override SVGs at those positions.
+    We use the model's attention-based char_indices to know EXACTLY which
+    strokes correspond to each character. Since spaces create minimal horizontal
+    movement, we SHIFT subsequent strokes to CREATE ROOM for the override.
 
     This ensures:
     1. Full RNN context for surrounding text (space is a valid character)
-    2. No artifacts from placeholder strokes (we clip them out)
-    3. Clean override insertion
+    2. PRECISE cuts based on model's internal knowledge
+    3. PROPER SPACING by shifting strokes to make room for overrides
+    4. Clean override insertion at natural positions
 
     Args:
         dwg: SVG drawing object
@@ -156,6 +158,8 @@ def _render_strokes_with_overrides(
         color: Stroke color
         width: Stroke width
         target_h: Target height for scaling overrides
+        char_indices: Array of character indices per stroke (from model attention).
+                      If provided, uses precise cutting; otherwise falls back to estimation.
 
     Returns:
         Final cursor_x position after rendering
@@ -171,62 +175,38 @@ def _render_strokes_with_overrides(
     total_stroke_width = stroke_max_x - stroke_min_x
     stroke_height = ls[:, 1].max()
     num_chars = len(original_text) if original_text else 1
-    avg_char_width = total_stroke_width / max(1, num_chars)
 
-    print(f"DEBUG render_with_overrides: text='{original_text}', num_chars={num_chars}, total_w={total_stroke_width:.2f}, avg_char_w={avg_char_width:.2f}")
-
-    # Sort override positions by character index
+    # Sort override positions by character index (process left to right)
     sorted_overrides = sorted(override_positions, key=lambda x: x[0])
 
-    # Build exclusion zones (X ranges to clip out) for each override position
-    exclusion_zones = []
-    for char_idx, override_char in sorted_overrides:
-        # Zone where placeholder strokes should be clipped
-        zone_start = char_idx * avg_char_width
-        zone_end = (char_idx + 1) * avg_char_width
-        exclusion_zones.append((zone_start, zone_end))
-        print(f"DEBUG exclusion zone for '{override_char}': char_idx={char_idx}, zone=[{zone_start:.2f}, {zone_end:.2f}]")
-
-    # STEP 1: Render strokes, CLIPPING OUT points within exclusion zones
-    ls_render = ls.copy()
-    ls_render[:, 0] += cursor_x - stroke_min_x  # Shift to cursor_x
-    ls_render[:, 1] += line_offset_y
+    # Determine if we can use precise char_indices
+    use_precise_indices = (
+        char_indices is not None and
+        len(char_indices) == ls.shape[0]
+    )
+
+    # Calculate average character width for sizing overrides
+    # Exclude override positions from calculation
+    if use_precise_indices:
+        non_override_chars = set(range(num_chars)) - set(ci for ci, _ in sorted_overrides)
+        char_widths = []
+        for ci in non_override_chars:
+            matching = np.where(char_indices == ci)[0]
+            if len(matching) > 1:
+                w = ls[matching[-1], 0] - ls[matching[0], 0]
+                if w > 0:
+                    char_widths.append(w)
+        avg_char_width = np.mean(char_widths) if char_widths else total_stroke_width / max(1, num_chars)
+    else:
+        avg_char_width = total_stroke_width / max(1, num_chars)
 
-    prev_eos = 1.0
-    commands = []
-    for x, y, eos in zip(*ls_render.T):
-        # Calculate relative X position from stroke start
-        rel_x = x - cursor_x
-
-        # Check if this point is within any exclusion zone
-        in_exclusion = False
-        for zone_start, zone_end in exclusion_zones:
-            if zone_start <= rel_x <= zone_end:
-                in_exclusion = True
-                break
-
-        if in_exclusion:
-            # Skip this point, mark as stroke break so next point starts a new path
-            prev_eos = 1.0
-        else:
-            commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-            prev_eos = eos
+    print(f"DEBUG render_with_overrides: avg_char_width={avg_char_width:.2f}")
 
-    if commands:
-        p = ' '.join(commands)
-        path = svgwrite.path.Path(p)
-        path = path.stroke(color=color, width=width, linecap='round', linejoin='round', miterlimit=2).fill('none')
-        dwg.add(path)
+    # STEP 1: Calculate override widths and insertion points
+    override_info = []  # List of override details
 
-    # STEP 2: Insert override SVGs at calculated positions
     for char_idx, override_char in sorted_overrides:
-        # Calculate where this character should be positioned
-        # The space placeholder created a gap here - we fill it with the override
-        char_start_x = cursor_x + (char_idx * avg_char_width)
-
-        print(f"DEBUG override at char_idx={char_idx}, char='{override_char}', calculated_x={char_start_x:.2f}")
-
-        # Get override data
+        # Get override data and calculate its rendered width
         override_data = get_random_override(overrides_dict, override_char)
         if not override_data:
             print(f"Warning: No override data for '{override_char}'")
@@ -267,25 +247,213 @@ def _render_strokes_with_overrides(
 
             scale_x = scale * x_stretch * line_scale_x
             scale_y = scale
-
-            # Rendered dimensions
             rendered_width = char_width * scale_x
 
-            print(f"DEBUG override render: char='{override_char}', char_h={char_height:.2f}, scale={scale:.4f}, rendered_w={rendered_width:.2f}, gap_w={avg_char_width:.2f}")
+            # Find insertion point and EXPANDED stroke range using char_indices
+            # We expand the range to include transition strokes (buffer zone)
+            stroke_range = None
+            exclusion_range = None  # Expanded range for excluding transition strokes
+
+            if use_precise_indices:
+                matching_strokes = np.where(char_indices == char_idx)[0]
+                if len(matching_strokes) > 0:
+                    start_idx = matching_strokes[0]
+                    end_idx = matching_strokes[-1]
+                    stroke_range = (start_idx, end_idx)
+
+                    # COMBINED BUFFER APPROACH:
+                    # 1. Stroke index based buffer
+                    # 2. X-position based exclusion zone
+                    # 3. Pen-up (eos) based extension
+
+                    # Calculate buffer size based on average strokes per character
+                    total_strokes = ls.shape[0]
+                    avg_strokes_per_char = total_strokes / max(1, num_chars)
+                    # Buffer: ~50% of average character's strokes on each side (more aggressive)
+                    stroke_buffer = int(max(5, avg_strokes_per_char * 0.5))
+
+                    # Expand range by stroke buffer
+                    expanded_start = max(0, start_idx - stroke_buffer)
+                    expanded_end = min(ls.shape[0] - 1, end_idx + stroke_buffer)
+
+                    # Also create an X-position exclusion zone
+                    # Find the X range of the space strokes
+                    space_x_coords = ls[start_idx:end_idx+1, 0]
+                    space_x_min = space_x_coords.min() if len(space_x_coords) > 0 else ls[start_idx, 0]
+                    space_x_max = space_x_coords.max() if len(space_x_coords) > 0 else ls[start_idx, 0]
+
+                    # Expand X zone by 0.5 * avg_char_width on each side
+                    x_buffer = avg_char_width * 0.5
+                    exclusion_x_min = space_x_min - x_buffer
+                    exclusion_x_max = space_x_max + x_buffer
+
+                    # Now expand stroke range to include ANY stroke with X in the exclusion zone
+                    # Search backward from expanded_start
+                    while expanded_start > 0:
+                        prev_x = ls[expanded_start - 1, 0]
+                        prev_eos = ls[expanded_start - 1, 2]
+                        # Include if X is in zone OR if it's a pen-up transition
+                        if exclusion_x_min <= prev_x <= exclusion_x_max or prev_eos > 0.5:
+                            expanded_start -= 1
+                        else:
+                            break
+
+                    # Search forward from expanded_end
+                    while expanded_end < ls.shape[0] - 1:
+                        next_x = ls[expanded_end + 1, 0]
+                        next_eos = ls[expanded_end, 2]  # Current stroke's eos indicates break after
+                        # Include if X is in zone OR if current is pen-up
+                        if exclusion_x_min <= next_x <= exclusion_x_max or next_eos > 0.5:
+                            expanded_end += 1
+                        else:
+                            break
+
+                    exclusion_range = (expanded_start, expanded_end)
+                    num_excluded = expanded_end - expanded_start + 1
+                    print(f"DEBUG: Expanded exclusion range from [{start_idx}, {end_idx}] to [{expanded_start}, {expanded_end}] ({num_excluded} strokes, buffer={stroke_buffer})")
+
+                    insertion_x = ls[start_idx, 0]
+                else:
+                    insertion_x = stroke_min_x + (char_idx * avg_char_width)
+                    exclusion_range = None
+            else:
+                insertion_x = stroke_min_x + (char_idx * avg_char_width)
+                exclusion_range = None
+
+            override_info.append({
+                'char_idx': char_idx,
+                'override_char': override_char,
+                'insertion_x': insertion_x,
+                'override_width': rendered_width,
+                'stroke_range': stroke_range,
+                'exclusion_range': exclusion_range,  # Expanded range for transition strokes
+                'override_data': override_data,
+                'char_min_x': char_min_x,
+                'char_min_y': char_min_y,
+                'scale_x': scale_x,
+                'scale_y': scale_y,
+            })
+
+            print(f"DEBUG: Override '{override_char}' at char_idx={char_idx}: insertion_x={insertion_x:.2f}, width={rendered_width:.2f}")
+
+        except Exception as e:
+            print(f"Error processing override '{override_char}': {e}")
+            continue
+
+    # STEP 2: Build shifted stroke coordinates
+    # We need to shift strokes AFTER each override to make room
+    ls_shifted = ls.copy()
+
+    # Calculate cumulative shift needed at each stroke position
+    cumulative_shift = np.zeros(ls.shape[0])
+
+    # Build set of all stroke indices to exclude (using expanded exclusion ranges)
+    excluded_stroke_indices = set()
+
+    for info in override_info:
+        char_idx = info['char_idx']
+        override_width = info['override_width']
+        exclusion_range = info.get('exclusion_range') or info.get('stroke_range')
+
+        # Add small spacing around override (like natural character spacing)
+        spacing = avg_char_width * 0.15
+        total_shift = override_width + spacing * 2
+
+        if use_precise_indices and exclusion_range is not None:
+            start_idx, end_idx = exclusion_range
+            # Add all strokes in exclusion range to the set
+            for idx in range(start_idx, end_idx + 1):
+                excluded_stroke_indices.add(idx)
+            # Shift all strokes AFTER the exclusion range
+            cumulative_shift[end_idx + 1:] += total_shift
+            print(f"DEBUG: Excluding strokes [{start_idx}, {end_idx}], shifting after by {total_shift:.2f}")
+        else:
+            # Fallback: shift based on X position
+            insertion_x = info['insertion_x']
+            mask = ls[:, 0] > insertion_x
+            cumulative_shift[mask] += total_shift
+
+    # Apply shifts to X coordinates
+    ls_shifted[:, 0] += cumulative_shift
+
+    # Recalculate total width after shifting
+    total_shifted_width = ls_shifted[:, 0].max() - ls_shifted[:, 0].min()
+
+    # STEP 3: Render strokes (excluding override positions AND transition strokes)
+    ls_render = ls_shifted.copy()
+    shifted_min_x = ls_shifted[:, 0].min()
+    ls_render[:, 0] += cursor_x - shifted_min_x
+    ls_render[:, 1] += line_offset_y
 
-            # Center the override in the space gap
-            # Gap is avg_char_width wide, override is rendered_width wide
-            gap_center_x = char_start_x + (avg_char_width / 2.0)
-            override_start_x = gap_center_x - (rendered_width / 2.0)
+    prev_eos = 1.0
+    commands = []
 
-            # Position override SVG
-            pos_x = override_start_x - (char_min_x * scale_x)
-            pos_y = line_offset_y - (char_min_y * scale_y)
+    if use_precise_indices:
+        # Use the expanded exclusion set (includes transition strokes)
+        for stroke_idx, (x, y, eos) in enumerate(zip(*ls_render.T)):
+            if stroke_idx in excluded_stroke_indices:
+                # Skip this stroke, mark as stroke break
+                prev_eos = 1.0
+            else:
+                commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+                prev_eos = eos
+    else:
+        # Fallback using exclusion zones
+        exclusion_zones = []
+        for info in override_info:
+            zone_start = info['insertion_x'] - shifted_min_x
+            zone_end = zone_start + info['override_width']
+            exclusion_zones.append((zone_start, zone_end))
+
+        for x, y, eos in zip(*ls_render.T):
+            rel_x = x - cursor_x
+            in_exclusion = any(start <= rel_x <= end for start, end in exclusion_zones)
+            if in_exclusion:
+                prev_eos = 1.0
+            else:
+                commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+                prev_eos = eos
 
-            # Create group with transform
-            g = dwg.g(transform=f"translate({pos_x},{pos_y}) scale({scale_x},{scale_y})")
+    if commands:
+        p = ' '.join(commands)
+        path = svgwrite.path.Path(p)
+        path = path.stroke(color=color, width=width, linecap='round', linejoin='round', miterlimit=2).fill('none')
+        dwg.add(path)
+
+    # STEP 4: Insert override SVGs at calculated positions (accounting for shifts)
+    running_shift = 0.0
+    for info in override_info:
+        char_idx = info['char_idx']
+        override_char = info['override_char']
+        override_data = info['override_data']
+        override_width = info['override_width']
+        stroke_range = info['stroke_range']
+
+        spacing = avg_char_width * 0.15
+
+        # Calculate position accounting for previous shifts
+        if use_precise_indices and stroke_range is not None:
+            start_idx, end_idx = stroke_range
+            # Use the shifted position
+            base_x = ls_shifted[start_idx, 0] - shifted_min_x + cursor_x
+        else:
+            base_x = info['insertion_x'] - stroke_min_x + cursor_x + running_shift
 
-            # Add paths from override SVG
+        # Add spacing before the override
+        override_start_x = base_x + spacing
+
+        # Position override SVG
+        pos_x = override_start_x - (info['char_min_x'] * info['scale_x'])
+        pos_y = line_offset_y - (info['char_min_y'] * info['scale_y'])
+
+        print(f"DEBUG: Rendering override '{override_char}' at pos_x={pos_x:.2f}")
+
+        # Create group with transform
+        g = dwg.g(transform=f"translate({pos_x},{pos_y}) scale({info['scale_x']},{info['scale_y']})")
+
+        # Add paths from override SVG
+        try:
+            svg_root = ET.fromstring(override_data['svg_data'])
             for elem in svg_root.iter():
                 tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
                 if tag_name == 'path':
@@ -297,7 +465,7 @@ def _render_strokes_with_overrides(
                     path = dwg.path(d=d)
 
                     if orig_stroke and orig_stroke.lower() not in ('none', 'transparent'):
-                        avg_scale = (scale_x + scale_y) / 2.0
+                        avg_scale = (info['scale_x'] + info['scale_y']) / 2.0
                         adjusted_stroke_width = width / avg_scale if avg_scale > 0 else width
                         path = path.stroke(
                             color=color,
@@ -311,14 +479,14 @@ def _render_strokes_with_overrides(
                     g.add(path)
 
             dwg.add(g)
-
         except Exception as e:
             print(f"Error rendering override '{override_char}': {e}")
-            import traceback
-            traceback.print_exc()
 
-    # Return final X position (same as total stroke width since we rendered everything)
-    final_x = cursor_x + total_stroke_width
+        # Track cumulative shift for fallback mode
+        running_shift += override_width + spacing * 2
+
+    # Return final X position
+    final_x = cursor_x + total_shifted_width
     return final_x
 
 
@@ -487,13 +655,29 @@ def _draw(
                 offsets_cp = offsets.copy()
                 offsets_cp[:, :2] *= float(global_scale)
                 ls = drawing.offsets_to_coords(offsets_cp)
-                if denoise:
-                    ls = drawing.denoise(ls)
-                if interpolate_factor > 1:
-                    try:
-                        ls = drawing.interpolate(ls, factor=interpolate_factor)
-                    except Exception:
-                        pass
+
+                # Get char_indices and override_positions for this segment
+                segment_char_indices = segment.get('char_indices', None)
+                segment_override_positions = segment.get('override_positions', [])
+                has_overrides = bool(segment_override_positions)
+
+                # IMPORTANT: Skip denoise/interpolate for segments with overrides
+                # This preserves the 1:1 correspondence between strokes and char_indices
+                # which is critical for precise model-based cutting
+                if has_overrides and segment_char_indices is not None:
+                    print(f"DEBUG preprocess: Skipping denoise/interpolate for override segment to preserve char_indices alignment")
+                    # Don't denoise or interpolate - keep exact correspondence
+                else:
+                    if denoise:
+                        ls = drawing.denoise(ls)
+                    if interpolate_factor > 1:
+                        try:
+                            ls = drawing.interpolate(ls, factor=interpolate_factor)
+                        except Exception:
+                            pass
+                    # Clear char_indices since they no longer align after denoise/interpolate
+                    segment_char_indices = None
+
                 if ls.shape[0] == 0:
                     preprocessed_segments.append({'type': 'empty'})
                     continue
@@ -510,7 +694,7 @@ def _draw(
                 raw_heights.append(raw_h)  # Track for average calculation
 
                 # DEBUG: Log preprocessing values
-                print(f"DEBUG preprocess: text='{segment.get('text', '')[:20]}', raw_h={raw_h:.2f}, s_h={s_h:.4f}, s_w={s_w:.4f}")
+                print(f"DEBUG preprocess: text='{segment.get('text', '')[:20]}', raw_h={raw_h:.2f}, s_h={s_h:.4f}, s_w={s_w:.4f}, has_overrides={has_overrides}")
 
                 preprocessed_segments.append({
                     'type': 'generated',
@@ -519,7 +703,8 @@ def _draw(
                     'color': color,
                     'width': width,
                     'text': segment.get('text', ''),  # Add original text for spacing checks
-                    'override_positions': segment.get('override_positions', [])  # Preserve override positions for placeholder approach
+                    'override_positions': segment_override_positions,  # Preserve override positions
+                    'char_indices': segment_char_indices  # Character indices (preserved for override segments)
                 })
 
         preprocessed_lines.append(preprocessed_segments if preprocessed_segments else [{'empty': True}])
@@ -711,9 +896,13 @@ def _draw(
                 override_positions = segment.get('override_positions', [])
 
                 if override_positions and overrides_dict:
-                    # SPACE PLACEHOLDER APPROACH: Render strokes normally (spaces create gaps),
-                    # then fill those gaps with override SVGs
-                    print(f"DEBUG: Using SPACE PLACEHOLDER rendering for segment with {len(override_positions)} overrides")
+                    # MODEL-LEVEL CHAR INDEX APPROACH: Use char_indices from attention for precise cutting
+                    char_indices = segment.get('char_indices', None)
+                    print(f"DEBUG: Using MODEL-LEVEL CHAR INDEX rendering for segment with {len(override_positions)} overrides")
+                    if char_indices is not None:
+                        print(f"DEBUG: Have char_indices: {len(char_indices)} values")
+                    else:
+                        print(f"DEBUG: No char_indices, will fall back to width estimation")
 
                     ls = segment['strokes'].copy()
                     ls[:, :2] *= s_global
@@ -737,7 +926,8 @@ def _draw(
                         line_scale_x=line_scale_x,
                         color=segment['color'],
                         width=segment['width'],
-                        target_h=segment_height
+                        target_h=segment_height,
+                        char_indices=char_indices  # NEW: Pass char_indices for precise cutting
                     )
                 else:
                     # STANDARD PATH: No overrides, render normally
diff --git a/handwriting_synthesis/hand/operations/sampling.py b/handwriting_synthesis/hand/operations/sampling.py
index aa4f724..95e6d08 100644
--- a/handwriting_synthesis/hand/operations/sampling.py
+++ b/handwriting_synthesis/hand/operations/sampling.py
@@ -12,7 +12,8 @@ def sample_strokes(
     rnn_model,
     lines: List[str],
     biases: Optional[List[float]] = None,
-    styles: Optional[List[int]] = None
+    styles: Optional[List[int]] = None,
+    return_char_indices: bool = False
 ) -> List[np.ndarray]:
     """
     Sample stroke sequences from the RNN model.
@@ -28,10 +29,16 @@ def sample_strokes(
                 consistency of the handwriting. Higher bias -> more legible,
                 less random.
         styles: Optional list of style IDs (one per line).
+        return_char_indices: If True, also return the character indices per stroke
+                             (from the attention mechanism's phi weights).
 
     Returns:
-        List of stroke sequences (numpy arrays of shape [T, 3]).
-        Each stroke point is (x, y, eos).
+        If return_char_indices is False:
+            List of stroke sequences (numpy arrays of shape [T, 3]).
+            Each stroke point is (x, y, eos).
+        If return_char_indices is True:
+            Tuple of (strokes_list, char_indices_list) where char_indices_list
+            contains the character index the model was attending to at each stroke.
     """
     num_samples = len(lines)
     max_tsteps = 40 * max([len(i) for i in lines])
@@ -62,18 +69,37 @@ def sample_strokes(
             chars[i, :len(encoded)] = encoded
             chars_len[i] = len(encoded)
 
-    [samples] = rnn_session.run(
-        [rnn_model.sampled_sequence],
-        feed_dict={
-            rnn_model.prime: styles is not None,
-            rnn_model.x_prime: x_prime,
-            rnn_model.x_prime_len: x_prime_len,
-            rnn_model.num_samples: num_samples,
-            rnn_model.sample_tsteps: max_tsteps,
-            rnn_model.c: chars,
-            rnn_model.c_len: chars_len,
-            rnn_model.bias: biases
-        }
-    )
-    samples = [sample[~np.all(sample == 0.0, axis=1)] for sample in samples]
-    return samples
+    feed_dict = {
+        rnn_model.prime: styles is not None,
+        rnn_model.x_prime: x_prime,
+        rnn_model.x_prime_len: x_prime_len,
+        rnn_model.num_samples: num_samples,
+        rnn_model.sample_tsteps: max_tsteps,
+        rnn_model.c: chars,
+        rnn_model.c_len: chars_len,
+        rnn_model.bias: biases
+    }
+
+    if return_char_indices:
+        # Fetch both stroke samples and character indices from attention
+        samples, char_indices = rnn_session.run(
+            [rnn_model.sampled_sequence, rnn_model.sampled_char_indices],
+            feed_dict=feed_dict
+        )
+        # Remove zero-padded strokes (and corresponding char indices)
+        strokes_list = []
+        char_indices_list = []
+        for sample, ci in zip(samples, char_indices):
+            # Find non-zero strokes
+            valid_mask = ~np.all(sample == 0.0, axis=1)
+            strokes_list.append(sample[valid_mask])
+            char_indices_list.append(ci[valid_mask])
+        return strokes_list, char_indices_list
+    else:
+        # Original behavior: only fetch stroke samples
+        [samples] = rnn_session.run(
+            [rnn_model.sampled_sequence],
+            feed_dict=feed_dict
+        )
+        samples = [sample[~np.all(sample == 0.0, axis=1)] for sample in samples]
+        return samples
diff --git a/handwriting_synthesis/rnn/RNN.py b/handwriting_synthesis/rnn/RNN.py
index 1263bfb..5278aa2 100644
--- a/handwriting_synthesis/rnn/RNN.py
+++ b/handwriting_synthesis/rnn/RNN.py
@@ -58,6 +58,7 @@ def __init__(
         self.initial_state = None
         self.final_state = None
         self.sampled_sequence = None
+        self.sampled_char_indices = None
         self.lstm_size = lstm_size
         self.output_mixture_components = output_mixture_components
         self.output_units = self.output_mixture_components * 6 + 1
@@ -149,20 +150,26 @@ def sample(self, cell):
             cell: The RNN cell to use for sampling.
 
         Returns:
-            Sampled sequence tensor.
+            Tuple of (sampled_sequence, char_indices) where:
+            - sampled_sequence: The stroke outputs
+            - char_indices: Character index per timestep from attention (argmax of phi)
         """
         initial_state = cell.zero_state(self.num_samples, dtype=tf.float32)
         initial_input = tf.concat([
             tf.zeros([self.num_samples, 2]),
             tf.ones([self.num_samples, 1]),
         ], axis=1)
-        return rnn_free_run(
+        states, outputs, final_state = rnn_free_run(
             cell=cell,
             sequence_length=self.sample_tsteps,
             initial_state=initial_state,
             initial_input=initial_input,
             scope='rnn'
-        )[1]
+        )
+        # Extract char_indices from phi: states.phi has shape [batch, timesteps, char_len]
+        # argmax gives us which character the model is attending to at each timestep
+        char_indices = tf.argmax(states.phi, axis=2)  # [batch, timesteps]
+        return outputs, char_indices
 
     def primed_sample(self, cell):
         """
@@ -172,7 +179,9 @@ def primed_sample(self, cell):
             cell: The RNN cell to use for sampling.
 
         Returns:
-            Sampled sequence tensor.
+            Tuple of (sampled_sequence, char_indices) where:
+            - sampled_sequence: The stroke outputs
+            - char_indices: Character index per timestep from attention (argmax of phi)
         """
         initial_state = cell.zero_state(self.num_samples, dtype=tf.float32)
         primed_state = tfcompat.nn.dynamic_rnn(
@@ -183,12 +192,15 @@ def primed_sample(self, cell):
             initial_state=initial_state,
             scope='rnn'
         )[1]
-        return rnn_free_run(
+        states, outputs, final_state = rnn_free_run(
             cell=cell,
             sequence_length=self.sample_tsteps,
             initial_state=primed_state,
             scope='rnn'
-        )[1]
+        )
+        # Extract char_indices from phi: states.phi has shape [batch, timesteps, char_len]
+        char_indices = tf.argmax(states.phi, axis=2)  # [batch, timesteps]
+        return outputs, char_indices
 
     def calculate_loss(self):
         """
@@ -236,9 +248,18 @@ def calculate_loss(self):
         pis, mus, sigmas, rhos, es = self.parse_parameters(params)
         sequence_loss, self.loss = self.nll(self.y, self.x_len, pis, mus, sigmas, rhos, es)
 
+        # Sample returns (outputs, char_indices) - use tf.cond on each
+        primed_outputs, primed_char_indices = self.primed_sample(cell)
+        unprimed_outputs, unprimed_char_indices = self.sample(cell)
+
         self.sampled_sequence = tf.cond(
             self.prime,
-            lambda: self.primed_sample(cell),
-            lambda: self.sample(cell)
+            lambda: primed_outputs,
+            lambda: unprimed_outputs
+        )
+        self.sampled_char_indices = tf.cond(
+            self.prime,
+            lambda: primed_char_indices,
+            lambda: unprimed_char_indices
         )
         return self.loss

From ad80c3b9f86a82cf39f831c8719080b12d12786b Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 11:24:51 +1300
Subject: [PATCH 11/21] Refactor override handling: implement SPACE-PLACEHOLDER
 approach to preserve RNN context, refine chunking, and improve line stitching
 with attention-based character indices.

---
 handwriting_synthesis/hand/Hand.py  | 279 +++++++++++++---------------
 handwriting_synthesis/hand/_draw.py | 127 +++++++------
 2 files changed, 201 insertions(+), 205 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index eaa7133..621499b 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -386,9 +386,14 @@ def write_chunked(
         all_lines = []
         all_line_texts = []
 
-        # If we have overrides, we need to handle text splitting differently
+        # If we have overrides, use SPACE-PLACEHOLDER approach (same as write())
+        # Key insight: Generate full text with SPACES where overrides go.
+        # This preserves RNN context for surrounding text (no separate generation calls).
+        # We then use char_indices from attention to insert overrides at precise positions.
         if overrides_dict:
-            from handwriting_synthesis.hand.character_override_utils import split_text_with_overrides
+            from handwriting_synthesis.hand.character_override_utils import estimate_override_width, get_random_override
+
+            print(f"DEBUG write_chunked: Using SPACE-PLACEHOLDER approach for overrides")
 
             # Track segments for each line (will be used later for line_segments)
             all_line_segment_data = []
@@ -401,51 +406,44 @@ def write_chunked(
                     all_line_segment_data.append([])
                     continue
 
-                # Split line into override and non-override chunks
-                text_chunks = split_text_with_overrides(input_line, overrides_dict)
-
-                # Process each chunk
-                line_segments_data = []
-                texts_to_generate = []
-                chunk_metadata = []
-
-                for chunk_text, is_override in text_chunks:
-                    if is_override:
-                        # Mark as override - will be handled during drawing
-                        line_segments_data.append({
-                            'type': 'override',
-                            'text': chunk_text,
-                            'is_override': True
-                        })
+                # STEP 1: Replace override characters with SPACES
+                # Track override positions within the ORIGINAL line
+                line_override_positions = []  # [(char_idx, char), ...]
+                modified_line_chars = []
+
+                for char_idx, char in enumerate(input_line):
+                    if char in overrides_dict:
+                        line_override_positions.append((char_idx, char))
+                        modified_line_chars.append(' ')  # Space placeholder
+                        print(f"DEBUG: Replacing '{char}' at position {char_idx} with SPACE placeholder")
                     else:
-                        # Non-override text - chunk it and prepare for generation
-                        sub_chunks = split_text_into_chunks(
-                            chunk_text,
-                            words_per_chunk=words_per_chunk,
-                            target_chars_per_chunk=target_chars_per_chunk,
-                            min_words=min_words_per_chunk,
-                            max_words=max_words_per_chunk,
-                            adaptive_chunking=adaptive_chunking,
-                            adaptive_strategy=adaptive_strategy
-                        )
-
-                        for sub_chunk in sub_chunks:
-                            gen_idx = len(texts_to_generate)
-                            texts_to_generate.append(sub_chunk)
-                            chunk_metadata.append({
-                                'gen_idx': gen_idx,
-                                'text': sub_chunk
-                            })
-                            line_segments_data.append({
-                                'type': 'generated',
-                                'text': sub_chunk,
-                                'gen_idx': gen_idx,
-                                'is_override': False
-                            })
-
-                # Validate characters in texts to generate
+                        modified_line_chars.append(char)
+
+                modified_line = ''.join(modified_line_chars)
+                print(f"DEBUG: Original line: '{input_line}'")
+                print(f"DEBUG: Modified line: '{modified_line}'")
+                print(f"DEBUG: Override positions: {line_override_positions}")
+
+                # STEP 2: Chunk the MODIFIED text (with spaces) normally
+                chunks = split_text_into_chunks(
+                    modified_line,
+                    words_per_chunk=words_per_chunk,
+                    target_chars_per_chunk=target_chars_per_chunk,
+                    min_words=min_words_per_chunk,
+                    max_words=max_words_per_chunk,
+                    adaptive_chunking=adaptive_chunking,
+                    adaptive_strategy=adaptive_strategy
+                )
+
+                if not chunks:
+                    all_lines.append(np.empty((0, 3)))
+                    all_line_texts.append('')
+                    all_line_segment_data.append([])
+                    continue
+
+                # Validate characters in chunks (spaces are valid, override chars replaced)
                 valid_char_set = set(drawing.alphabet)
-                for chunk_num, chunk in enumerate(texts_to_generate):
+                for chunk_num, chunk in enumerate(chunks):
                     for char in chunk:
                         if char not in valid_char_set:
                             raise ValueError(
@@ -453,120 +451,111 @@ def write_chunked(
                                 f"Valid character set is {valid_char_set}"
                             )
 
-                # Generate strokes for non-override chunks only
-                if texts_to_generate:
-                    chunk_strokes = self._sample(
-                        texts_to_generate,
-                        biases=[biases] * len(texts_to_generate) if biases is not None else None,
-                        styles=[styles] * len(texts_to_generate) if styles is not None else None
-                    )
+                # STEP 3: Generate strokes for all chunks WITH char_indices
+                # This preserves full RNN context across the entire modified text
+                chunk_strokes, chunk_char_indices = self._sample(
+                    chunks,
+                    biases=[biases] * len(chunks) if biases is not None else None,
+                    styles=[styles] * len(chunks) if styles is not None else None,
+                    return_char_indices=True  # Get char indices from attention
+                )
+
+                print(f"DEBUG: Generated {len(chunks)} chunks with char_indices")
+
+                # STEP 4: Map override positions to chunks
+                # Track which character position each chunk starts at in the original line
+                chunk_start_positions = []
+                current_pos = 0
+                for chunk in chunks:
+                    chunk_start_positions.append(current_pos)
+                    current_pos += len(chunk)
 
-                    # Map generated strokes back to segments
-                    for segment in line_segments_data:
-                        if segment['type'] == 'generated':
-                            segment['strokes'] = chunk_strokes[segment['gen_idx']]
-                else:
-                    chunk_strokes = []
+                print(f"DEBUG: Chunk start positions: {chunk_start_positions}")
 
-                # Now stitch the generated chunks together, handling overrides
+                # STEP 5: Build segment data with override info for each chunk
+                # Stitch chunks into lines based on actual widths
                 current_line_stroke = np.empty((0, 3))
                 current_line_text = []
                 current_line_width = 0.0
                 current_line_segment_list = []
 
-                for seg_idx, segment in enumerate(line_segments_data):
-                    if segment['type'] == 'override':
-                        # Estimate override width for layout
-                        from handwriting_synthesis.hand.character_override_utils import get_random_override, estimate_override_width
-                        override_data = get_random_override(overrides_dict, segment['text'])
+                for chunk_idx, (chunk_text, chunk_stroke, char_indices) in enumerate(
+                    zip(chunks, chunk_strokes, chunk_char_indices)
+                ):
+                    chunk_start = chunk_start_positions[chunk_idx]
+                    chunk_end = chunk_start + len(chunk_text)
+
+                    # Find override positions that fall within this chunk
+                    chunk_override_positions = []
+                    for orig_char_idx, override_char in line_override_positions:
+                        if chunk_start <= orig_char_idx < chunk_end:
+                            # Convert to chunk-local index
+                            local_idx = orig_char_idx - chunk_start
+                            chunk_override_positions.append((local_idx, override_char))
+
+                    has_overrides = len(chunk_override_positions) > 0
+                    print(f"DEBUG: Chunk {chunk_idx} '{chunk_text}': has_overrides={has_overrides}, positions={chunk_override_positions}")
+
+                    # Calculate chunk width (including estimated override widths)
+                    chunk_width = get_stroke_width(chunk_stroke)
+
+                    # For width calculation, estimate how much extra space overrides need
+                    # (the actual rendering will shift strokes, but we need to estimate for line breaking)
+                    extra_override_width = 0.0
+                    for local_idx, override_char in chunk_override_positions:
+                        override_data = get_random_override(overrides_dict, override_char)
                         if override_data:
-                            # Estimate width (using typical line height of 60px)
-                            override_width = estimate_override_width(override_data, target_height=60, x_stretch=1.0)
-                        else:
-                            override_width = 20  # fallback width
-
-                        # FIXED: Check for adjacent spaces and apply appropriate spacing
-                        # This matches the logic in _draw.py for consistent line breaking
-                        has_space_before = False
-                        if seg_idx > 0:
-                            prev_seg = line_segments_data[seg_idx - 1]
-                            if prev_seg.get('type') == 'generated':
-                                prev_text = prev_seg.get('text', '')
-                                has_space_before = prev_text.strip() == '' or prev_text.endswith(' ')
-
-                        has_space_after = False
-                        if seg_idx < len(line_segments_data) - 1:
-                            next_seg = line_segments_data[seg_idx + 1]
-                            if next_seg.get('type') == 'generated':
-                                next_text = next_seg.get('text', '')
-                                has_space_after = next_text.strip() == '' or next_text.startswith(' ')
-
-                        # When there's a space adjacent, use space-width spacing
-                        # When there's no space, use minimal character spacing
-                        space_width = override_width * 0.35
-                        spacing_before = space_width if has_space_before else override_width * 0.15
-                        spacing_after = space_width if has_space_after else override_width * 0.15
-                        override_width_with_spacing = spacing_before + override_width + spacing_after
-
-                        potential_width = current_line_width
-                        if current_line_width > 0:
-                            potential_width += override_width_with_spacing
-                        else:
-                            potential_width = override_width_with_spacing
+                            # Space creates minimal width, override needs actual width
+                            override_w = estimate_override_width(override_data, target_height=60, x_stretch=1.0)
+                            # Add the difference (override width minus space width, plus some spacing)
+                            extra_override_width += override_w + (override_w * 0.3)
 
-                        if potential_width <= max_line_width or current_line_width == 0:
-                            # Fits on current line
-                            current_line_text.append(segment['text'])
-                            current_line_segment_list.append(segment)
-                            current_line_width = potential_width
-                        else:
-                            # Start new line
-                            if len(current_line_stroke) > 0 or len(current_line_text) > 0:
-                                all_lines.append(current_line_stroke)
-                                all_line_texts.append(''.join(current_line_text))
-                                all_line_segment_data.append(current_line_segment_list)
-
-                            current_line_stroke = np.empty((0, 3))
-                            current_line_text = [segment['text']]
-                            current_line_segment_list = [segment]
-                            current_line_width = override_width_with_spacing
+                    effective_chunk_width = chunk_width + extra_override_width
+
+                    # Check if chunk fits on current line
+                    potential_width = current_line_width
+                    if current_line_width > 0:
+                        potential_width += chunk_spacing + effective_chunk_width
                     else:
-                        # Generated chunk
-                        chunk_stroke = segment['strokes']
-                        chunk_width = get_stroke_width(chunk_stroke)
+                        potential_width = effective_chunk_width
+
+                    # Build segment data
+                    segment = {
+                        'type': 'generated',
+                        'text': input_line[chunk_start:chunk_end],  # Original text (with override chars)
+                        'modified_text': chunk_text,  # Text that was generated (with spaces)
+                        'strokes': chunk_stroke,
+                        'char_indices': char_indices,  # Attention-based character indices
+                        'override_positions': chunk_override_positions,  # [(local_idx, char), ...]
+                        'chunk_start': chunk_start,
+                        'chunk_end': chunk_end,
+                    }
 
-                        # Check if chunk fits on current line
-                        potential_width = current_line_width
+                    if potential_width <= max_line_width or current_line_width == 0:
+                        # Chunk fits on current line
                         if current_line_width > 0:
-                            potential_width += chunk_spacing + chunk_width
-                        else:
-                            potential_width = chunk_width
-
-                        if potential_width <= max_line_width or current_line_width == 0:
-                            # Chunk fits on current line
-                            if current_line_width > 0:
-                                current_line_stroke = stitch_strokes(
-                                    current_line_stroke,
-                                    chunk_stroke,
-                                    chunk_spacing,
-                                    rotate_to_match=rotate_chunks
-                                )
-                            else:
-                                current_line_stroke = chunk_stroke
-                            current_line_text.append(segment['text'])
-                            current_line_segment_list.append(segment)
-                            current_line_width = potential_width
+                            current_line_stroke = stitch_strokes(
+                                current_line_stroke,
+                                chunk_stroke,
+                                chunk_spacing,
+                                rotate_to_match=rotate_chunks
+                            )
                         else:
-                            # Start new line (width exceeded)
-                            if len(current_line_stroke) > 0 or len(current_line_text) > 0:
-                                all_lines.append(current_line_stroke)
-                                all_line_texts.append(''.join(current_line_text))
-                                all_line_segment_data.append(current_line_segment_list)
-
                             current_line_stroke = chunk_stroke
-                            current_line_text = [segment['text']]
-                            current_line_segment_list = [segment]
-                            current_line_width = chunk_width
+                        current_line_text.append(input_line[chunk_start:chunk_end])
+                        current_line_segment_list.append(segment)
+                        current_line_width = potential_width
+                    else:
+                        # Start new line (width exceeded)
+                        if len(current_line_stroke) > 0 or len(current_line_text) > 0:
+                            all_lines.append(current_line_stroke)
+                            all_line_texts.append(''.join(current_line_text))
+                            all_line_segment_data.append(current_line_segment_list)
+
+                        current_line_stroke = chunk_stroke
+                        current_line_text = [input_line[chunk_start:chunk_end]]
+                        current_line_segment_list = [segment]
+                        current_line_width = effective_chunk_width
 
                 # Add last line from this input line
                 if len(current_line_stroke) > 0 or len(current_line_text) > 0:
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index c1761af..1da5aad 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -261,56 +261,56 @@ def _render_strokes_with_overrides(
                     end_idx = matching_strokes[-1]
                     stroke_range = (start_idx, end_idx)
 
-                    # COMBINED BUFFER APPROACH:
-                    # 1. Stroke index based buffer
-                    # 2. X-position based exclusion zone
-                    # 3. Pen-up (eos) based extension
-
-                    # Calculate buffer size based on average strokes per character
+                    # AGGRESSIVE EXCLUSION APPROACH:
+                    # The artifacts are "trailing strokes" of the previous character -
+                    # the connecting/cursive tail that extends toward the space.
+                    # We need to exclude:
+                    # 1. All space strokes (char_idx == space_idx)
+                    # 2. The LAST N strokes of the previous character (the tail)
+                    # 3. The FIRST N strokes of the next character (any leading artifacts)
+
+                    # Calculate how many strokes to exclude from adjacent characters
                     total_strokes = ls.shape[0]
                     avg_strokes_per_char = total_strokes / max(1, num_chars)
-                    # Buffer: ~50% of average character's strokes on each side (more aggressive)
-                    stroke_buffer = int(max(5, avg_strokes_per_char * 0.5))
-
-                    # Expand range by stroke buffer
-                    expanded_start = max(0, start_idx - stroke_buffer)
-                    expanded_end = min(ls.shape[0] - 1, end_idx + stroke_buffer)
-
-                    # Also create an X-position exclusion zone
-                    # Find the X range of the space strokes
-                    space_x_coords = ls[start_idx:end_idx+1, 0]
-                    space_x_min = space_x_coords.min() if len(space_x_coords) > 0 else ls[start_idx, 0]
-                    space_x_max = space_x_coords.max() if len(space_x_coords) > 0 else ls[start_idx, 0]
-
-                    # Expand X zone by 0.5 * avg_char_width on each side
-                    x_buffer = avg_char_width * 0.5
-                    exclusion_x_min = space_x_min - x_buffer
-                    exclusion_x_max = space_x_max + x_buffer
-
-                    # Now expand stroke range to include ANY stroke with X in the exclusion zone
-                    # Search backward from expanded_start
-                    while expanded_start > 0:
-                        prev_x = ls[expanded_start - 1, 0]
-                        prev_eos = ls[expanded_start - 1, 2]
-                        # Include if X is in zone OR if it's a pen-up transition
-                        if exclusion_x_min <= prev_x <= exclusion_x_max or prev_eos > 0.5:
-                            expanded_start -= 1
-                        else:
-                            break
-
-                    # Search forward from expanded_end
-                    while expanded_end < ls.shape[0] - 1:
-                        next_x = ls[expanded_end + 1, 0]
-                        next_eos = ls[expanded_end, 2]  # Current stroke's eos indicates break after
-                        # Include if X is in zone OR if current is pen-up
-                        if exclusion_x_min <= next_x <= exclusion_x_max or next_eos > 0.5:
-                            expanded_end += 1
-                        else:
-                            break
+                    # Tail buffer: ~40% of average character's strokes
+                    tail_buffer = int(max(8, avg_strokes_per_char * 0.4))
+
+                    # Start with the space strokes
+                    expanded_start = start_idx
+                    expanded_end = end_idx
+
+                    # Find previous character's strokes and exclude its tail
+                    prev_char_idx = char_idx - 1
+                    if prev_char_idx >= 0:
+                        prev_char_strokes = np.where(char_indices == prev_char_idx)[0]
+                        if len(prev_char_strokes) > 0:
+                            # Exclude the last tail_buffer strokes of the previous character
+                            prev_char_end_stroke = prev_char_strokes[-1]
+                            prev_char_tail_start = max(prev_char_strokes[0], prev_char_end_stroke - tail_buffer + 1)
+                            expanded_start = min(expanded_start, prev_char_tail_start)
+                            print(f"DEBUG: Excluding tail of prev char (idx {prev_char_idx}): strokes [{prev_char_tail_start}, {prev_char_end_stroke}]")
+
+                    # Find next character's strokes and exclude its leading strokes
+                    next_char_idx = char_idx + 1
+                    if next_char_idx <= char_indices.max():
+                        next_char_strokes = np.where(char_indices == next_char_idx)[0]
+                        if len(next_char_strokes) > 0:
+                            # Exclude the first few strokes of the next character (smaller buffer)
+                            next_char_start_stroke = next_char_strokes[0]
+                            lead_buffer = tail_buffer // 2  # Smaller buffer for leading strokes
+                            next_char_lead_end = min(next_char_strokes[-1], next_char_start_stroke + lead_buffer - 1)
+                            expanded_end = max(expanded_end, next_char_lead_end)
+                            print(f"DEBUG: Excluding lead of next char (idx {next_char_idx}): strokes [{next_char_start_stroke}, {next_char_lead_end}]")
+
+                    # Also extend to catch any pen-up transitions at the boundaries
+                    while expanded_start > 0 and ls[expanded_start - 1, 2] > 0.5:
+                        expanded_start -= 1
+                    while expanded_end < ls.shape[0] - 1 and ls[expanded_end, 2] > 0.5:
+                        expanded_end += 1
 
                     exclusion_range = (expanded_start, expanded_end)
                     num_excluded = expanded_end - expanded_start + 1
-                    print(f"DEBUG: Expanded exclusion range from [{start_idx}, {end_idx}] to [{expanded_start}, {expanded_end}] ({num_excluded} strokes, buffer={stroke_buffer})")
+                    print(f"DEBUG: Final exclusion range [{expanded_start}, {expanded_end}] ({num_excluded} strokes, tail_buffer={tail_buffer})")
 
                     insertion_x = ls[start_idx, 0]
                 else:
@@ -930,9 +930,15 @@ def _draw(
                         char_indices=char_indices  # NEW: Pass char_indices for precise cutting
                     )
                 else:
-                    # STANDARD PATH: No overrides, render normally
+                    # STANDARD PATH: No overrides in this segment, render normally
                     ls = segment['strokes'].copy()
                     raw_h_before_scale = ls[:, 1].max()
+
+                    # NOTE: With the space-placeholder approach, we no longer need aggressive
+                    # clipping for segments adjacent to overrides. Text is generated as a
+                    # continuous sequence with spaces where overrides go, and char_indices
+                    # from attention give us precise cutting positions.
+
                     ls[:, :2] *= s_global
                     if x_stretch != 1.0:
                         ls[:, 0] *= x_stretch
@@ -942,8 +948,8 @@ def _draw(
                         ls[:, 0] *= line_scale_x
 
                     # Track segment width before translating
-                    segment_width = ls[:, 0].max()
-                    segment_height = ls[:, 1].max()
+                    segment_width = ls[:, 0].max() if ls.shape[0] > 0 else 0
+                    segment_height = ls[:, 1].max() if ls.shape[0] > 0 else 0
 
                     # Add inter-segment spacing
                     prev_seg = preprocessed_segments[seg_idx - 1] if seg_idx > 0 else None
@@ -953,18 +959,19 @@ def _draw(
                     # DEBUG: Log generated segment dimensions
                     print(f"DEBUG generated: text='{segment.get('text', '')[:20]}', raw_h={raw_h_before_scale:.2f}, final_h={segment_height:.2f}")
 
-                    ls[:, 0] += cursor_x
-                    ls[:, 1] += line_offset_y
-
-                    prev_eos = 1.0
-                    commands = []
-                    for x, y, eos in zip(*ls.T):
-                        commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-                        prev_eos = eos
-                    p = ' '.join(commands)
-                    path = svgwrite.path.Path(p)
-                    path = path.stroke(color=segment['color'], width=segment['width'], linecap='round', linejoin='round', miterlimit=2).fill('none')
-                    dwg.add(path)
+                    if ls.shape[0] > 0:
+                        ls[:, 0] += cursor_x
+                        ls[:, 1] += line_offset_y
+
+                        prev_eos = 1.0
+                        commands = []
+                        for x, y, eos in zip(*ls.T):
+                            commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+                            prev_eos = eos
+                        p = ' '.join(commands)
+                        path = svgwrite.path.Path(p)
+                        path = path.stroke(color=segment['color'], width=segment['width'], linecap='round', linejoin='round', miterlimit=2).fill('none')
+                        dwg.add(path)
 
                     # Advance cursor by segment width
                     cursor_x += segment_width

From a741d063c25201f9f015e8ea5372e7a1e7c59ca2 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 11:36:03 +1300
Subject: [PATCH 12/21] Refactor character override parsing: improve SVG path
 handling with bezier curve support, simplify exclusion logic, and enhance
 debug logs.

---
 handwriting_synthesis/hand/_draw.py           | 108 ++++++++++--------
 .../hand/character_override_utils.py          |  16 ++-
 2 files changed, 74 insertions(+), 50 deletions(-)

diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 1da5aad..2e013db 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -17,6 +17,52 @@
 }
 
 
+def _extract_svg_coordinates(d_string):
+    """
+    Extract coordinate points from an SVG path 'd' attribute.
+
+    Collects the x/y pairs of M, L, C, Q, S and T commands (bezier control
+    points included) and the endpoint of A commands, so callers can build a
+    bounding box for characters with curves (like the '!' dot). Values are
+    returned as written: relative (lowercase) commands are NOT resolved
+    against the current point, and H/V/Z commands contribute no points.
+
+    Args:
+        d_string: The 'd' attribute value from an SVG path element.
+
+    Returns:
+        List of (x, y) float tuples, grouped by command type in the order
+        M/L, C, Q, S, T, A (path order within each group).
+    """
+    letters = 'MmZzLlHhVvCcSsQqTtAa'
+    # Tokenize numbers the way SVG lets them be written ('10-5', '.5.5',
+    # '1e-3'). The previous '[-\d.]+' class passed such runs to float()
+    # unsplit and raised ValueError.
+    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
+    # Every command letter ends the previous segment, including the ones we
+    # do not collect, so their arguments are never attributed to a neighbour.
+    segment = '([%s])([^%s]*)' % (letters, letters)
+    segments = [
+        (cmd.upper(), [float(tok) for tok in re.findall(number, args)])
+        for cmd, args in re.findall(segment, d_string)
+    ]
+
+    coords = []
+    # (commands, values per coordinate set, index of first value to keep);
+    # for arcs only the endpoint (values 5 and 6 of 7) is a coordinate.
+    for wanted, arity, first in (('ML', 2, 0), ('C', 6, 0), ('Q', 4, 0),
+                                 ('S', 4, 0), ('T', 2, 0), ('A', 7, 5)):
+        for cmd, values in segments:
+            if cmd not in wanted:
+                continue
+            # A letter may be followed by several coordinate sets (implicit
+            # repetition); an incomplete trailing set is ignored.
+            for base in range(0, len(values) - arity + 1, arity):
+                for i in range(base + first, base + arity, 2):
+                    coords.append((values[i], values[i + 1]))
+    return coords
+
+
 def _to_px(value, units):
     """
     Converts a value to pixels based on the given unit.
@@ -222,10 +268,11 @@ def _render_strokes_with_overrides(
                 tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
                 if tag_name == 'path':
                     d = elem.get('d', '')
-                    coords = re.findall(r'[ML]\s*([-\d.]+)\s+([-\d.]+)', d)
+                    # Use comprehensive SVG parsing to capture bezier curves (e.g., for '!' dot)
+                    coords = _extract_svg_coordinates(d)
                     for x, y in coords:
-                        all_x_coords.append(float(x))
-                        all_y_coords.append(float(y))
+                        all_x_coords.append(x)
+                        all_y_coords.append(y)
 
             if not all_x_coords or not all_y_coords:
                 print(f"Warning: No coordinates found for override '{override_char}'")
@@ -261,48 +308,14 @@ def _render_strokes_with_overrides(
                     end_idx = matching_strokes[-1]
                     stroke_range = (start_idx, end_idx)
 
-                    # AGGRESSIVE EXCLUSION APPROACH:
-                    # The artifacts are "trailing strokes" of the previous character -
-                    # the connecting/cursive tail that extends toward the space.
-                    # We need to exclude:
-                    # 1. All space strokes (char_idx == space_idx)
-                    # 2. The LAST N strokes of the previous character (the tail)
-                    # 3. The FIRST N strokes of the next character (any leading artifacts)
-
-                    # Calculate how many strokes to exclude from adjacent characters
-                    total_strokes = ls.shape[0]
-                    avg_strokes_per_char = total_strokes / max(1, num_chars)
-                    # Tail buffer: ~40% of average character's strokes
-                    tail_buffer = int(max(8, avg_strokes_per_char * 0.4))
-
-                    # Start with the space strokes
+                    # SIMPLIFIED EXCLUSION: Only exclude the space placeholder strokes
+                    # Don't aggressively cut into adjacent characters - this was causing
+                    # visible artifacts by removing actual character strokes.
                     expanded_start = start_idx
                     expanded_end = end_idx
 
-                    # Find previous character's strokes and exclude its tail
-                    prev_char_idx = char_idx - 1
-                    if prev_char_idx >= 0:
-                        prev_char_strokes = np.where(char_indices == prev_char_idx)[0]
-                        if len(prev_char_strokes) > 0:
-                            # Exclude the last tail_buffer strokes of the previous character
-                            prev_char_end_stroke = prev_char_strokes[-1]
-                            prev_char_tail_start = max(prev_char_strokes[0], prev_char_end_stroke - tail_buffer + 1)
-                            expanded_start = min(expanded_start, prev_char_tail_start)
-                            print(f"DEBUG: Excluding tail of prev char (idx {prev_char_idx}): strokes [{prev_char_tail_start}, {prev_char_end_stroke}]")
-
-                    # Find next character's strokes and exclude its leading strokes
-                    next_char_idx = char_idx + 1
-                    if next_char_idx <= char_indices.max():
-                        next_char_strokes = np.where(char_indices == next_char_idx)[0]
-                        if len(next_char_strokes) > 0:
-                            # Exclude the first few strokes of the next character (smaller buffer)
-                            next_char_start_stroke = next_char_strokes[0]
-                            lead_buffer = tail_buffer // 2  # Smaller buffer for leading strokes
-                            next_char_lead_end = min(next_char_strokes[-1], next_char_start_stroke + lead_buffer - 1)
-                            expanded_end = max(expanded_end, next_char_lead_end)
-                            print(f"DEBUG: Excluding lead of next char (idx {next_char_idx}): strokes [{next_char_start_stroke}, {next_char_lead_end}]")
-
-                    # Also extend to catch any pen-up transitions at the boundaries
+                    # Only extend to include pen-up transitions at boundaries
+                    # This catches connecting strokes that are part of the transition
                     while expanded_start > 0 and ls[expanded_start - 1, 2] > 0.5:
                         expanded_start -= 1
                     while expanded_end < ls.shape[0] - 1 and ls[expanded_end, 2] > 0.5:
@@ -310,10 +323,12 @@ def _render_strokes_with_overrides(
 
                     exclusion_range = (expanded_start, expanded_end)
                     num_excluded = expanded_end - expanded_start + 1
-                    print(f"DEBUG: Final exclusion range [{expanded_start}, {expanded_end}] ({num_excluded} strokes, tail_buffer={tail_buffer})")
+                    print(f"DEBUG: Exclusion for '{override_char}' (char_idx={char_idx}): "
+                          f"strokes [{expanded_start}, {expanded_end}] ({num_excluded} strokes)")
 
                     insertion_x = ls[start_idx, 0]
                 else:
+                    print(f"DEBUG: No matching strokes for char_idx={char_idx}, falling back to position estimate")
                     insertion_x = stroke_min_x + (char_idx * avg_char_width)
                     exclusion_range = None
             else:
@@ -989,10 +1004,11 @@ def _draw(
                         tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
                         if tag_name == 'path':
                             d = elem.get('d', '')
-                            coords = re.findall(r'[ML]\s*([-\d.]+)\s+([-\d.]+)', d)
+                            # Use comprehensive SVG parsing to capture bezier curves (e.g., for '!' dot)
+                            coords = _extract_svg_coordinates(d)
                             for x, y in coords:
-                                all_x_coords.append(float(x))
-                                all_y_coords.append(float(y))
+                                all_x_coords.append(x)
+                                all_y_coords.append(y)
 
                     if not all_x_coords or not all_y_coords:
                         print(f"Warning: No coordinates found for override '{segment.get('char', '?')}'")
diff --git a/handwriting_synthesis/hand/character_override_utils.py b/handwriting_synthesis/hand/character_override_utils.py
index d73b062..605c29d 100644
--- a/handwriting_synthesis/hand/character_override_utils.py
+++ b/handwriting_synthesis/hand/character_override_utils.py
@@ -269,10 +269,18 @@ def estimate_override_width(override_data, target_height, x_stretch=1.0):
             tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
             if tag_name == 'path':
                 d = elem.get('d', '')
-                coords = re.findall(r'[ML]\s*([-\d.]+)\s+([-\d.]+)', d)
-                for x, y in coords:
-                    all_x_coords.append(float(x))
-                    all_y_coords.append(float(y))
+                # Extract M/L coordinates
+                for match in re.finditer(r'[MLml]\s*([-\d.]+)[,\s]+([-\d.]+)', d):
+                    all_x_coords.append(float(match.group(1)))
+                    all_y_coords.append(float(match.group(2)))
+                # Extract C (cubic bezier) control and end points for bounding box
+                for match in re.finditer(r'[Cc]\s*([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)', d):
+                    all_x_coords.extend([float(match.group(1)), float(match.group(3)), float(match.group(5))])
+                    all_y_coords.extend([float(match.group(2)), float(match.group(4)), float(match.group(6))])
+                # Extract Q (quadratic bezier) points
+                for match in re.finditer(r'[Qq]\s*([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)[,\s]+([-\d.]+)', d):
+                    all_x_coords.extend([float(match.group(1)), float(match.group(3))])
+                    all_y_coords.extend([float(match.group(2)), float(match.group(4))])
 
         if all_x_coords and all_y_coords:
             char_width = max(all_x_coords) - min(all_x_coords)

From 70169cf2501c694cbc31d97749393c241580fd16 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 12:45:31 +1300
Subject: [PATCH 13/21] Refactor override handling: preserve chunk-original
 mappings, improve override position tracking, and refine stroke generation
 logic.

---
 handwriting_synthesis/hand/Hand.py | 124 +++++++++++++----------------
 1 file changed, 54 insertions(+), 70 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 621499b..85bc186 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -386,10 +386,9 @@ def write_chunked(
         all_lines = []
         all_line_texts = []
 
-        # If we have overrides, use SPACE-PLACEHOLDER approach (same as write())
-        # Key insight: Generate full text with SPACES where overrides go.
-        # This preserves RNN context for surrounding text (no separate generation calls).
-        # We then use char_indices from attention to insert overrides at precise positions.
+        # If we have overrides, use SPACE-PLACEHOLDER approach
+        # KEY FIX: Chunk the ORIGINAL text first, THEN replace override chars in each chunk.
+        # This preserves the position mapping between chunks and the original text.
         if overrides_dict:
             from handwriting_synthesis.hand.character_override_utils import estimate_override_width, get_random_override
 
@@ -406,27 +405,10 @@ def write_chunked(
                     all_line_segment_data.append([])
                     continue
 
-                # STEP 1: Replace override characters with SPACES
-                # Track override positions within the ORIGINAL line
-                line_override_positions = []  # [(char_idx, char), ...]
-                modified_line_chars = []
-
-                for char_idx, char in enumerate(input_line):
-                    if char in overrides_dict:
-                        line_override_positions.append((char_idx, char))
-                        modified_line_chars.append(' ')  # Space placeholder
-                        print(f"DEBUG: Replacing '{char}' at position {char_idx} with SPACE placeholder")
-                    else:
-                        modified_line_chars.append(char)
-
-                modified_line = ''.join(modified_line_chars)
-                print(f"DEBUG: Original line: '{input_line}'")
-                print(f"DEBUG: Modified line: '{modified_line}'")
-                print(f"DEBUG: Override positions: {line_override_positions}")
-
-                # STEP 2: Chunk the MODIFIED text (with spaces) normally
-                chunks = split_text_into_chunks(
-                    modified_line,
+                # STEP 1: Chunk the ORIGINAL text first (before any modification)
+                # This preserves word boundaries and spacing correctly
+                original_chunks = split_text_into_chunks(
+                    input_line,
                     words_per_chunk=words_per_chunk,
                     target_chars_per_chunk=target_chars_per_chunk,
                     min_words=min_words_per_chunk,
@@ -435,15 +417,40 @@ def write_chunked(
                     adaptive_strategy=adaptive_strategy
                 )
 
-                if not chunks:
+                if not original_chunks:
                     all_lines.append(np.empty((0, 3)))
                     all_line_texts.append('')
                     all_line_segment_data.append([])
                     continue
 
-                # Validate characters in chunks (spaces are valid, override chars replaced)
+                print(f"DEBUG: Original line: '{input_line}'")
+                print(f"DEBUG: Original chunks: {original_chunks}")
+
+                # STEP 2: For each chunk, identify overrides and create modified version
+                modified_chunks = []  # Chunks with override chars replaced by spaces
+                chunk_override_info = []  # Override positions for each chunk
+
+                for chunk_idx, original_chunk in enumerate(original_chunks):
+                    chunk_overrides = []  # [(local_idx, char), ...]
+                    modified_chars = []
+
+                    for char_idx, char in enumerate(original_chunk):
+                        if char in overrides_dict:
+                            chunk_overrides.append((char_idx, char))
+                            modified_chars.append(' ')  # Space placeholder
+                            print(f"DEBUG: Chunk {chunk_idx}: Replacing '{char}' at local position {char_idx} with SPACE")
+                        else:
+                            modified_chars.append(char)
+
+                    modified_chunk = ''.join(modified_chars)
+                    modified_chunks.append(modified_chunk)
+                    chunk_override_info.append(chunk_overrides)
+
+                    print(f"DEBUG: Chunk {chunk_idx}: original='{original_chunk}' modified='{modified_chunk}' overrides={chunk_overrides}")
+
+                # STEP 3: Validate modified chunks (should only contain valid alphabet chars)
                 valid_char_set = set(drawing.alphabet)
-                for chunk_num, chunk in enumerate(chunks):
+                for chunk_num, chunk in enumerate(modified_chunks):
                     for char in chunk:
                         if char not in valid_char_set:
                             raise ValueError(
@@ -451,26 +458,15 @@ def write_chunked(
                                 f"Valid character set is {valid_char_set}"
                             )
 
-                # STEP 3: Generate strokes for all chunks WITH char_indices
-                # This preserves full RNN context across the entire modified text
+                # STEP 4: Generate strokes for modified chunks WITH char_indices
                 chunk_strokes, chunk_char_indices = self._sample(
-                    chunks,
-                    biases=[biases] * len(chunks) if biases is not None else None,
-                    styles=[styles] * len(chunks) if styles is not None else None,
+                    modified_chunks,
+                    biases=[biases] * len(modified_chunks) if biases is not None else None,
+                    styles=[styles] * len(modified_chunks) if styles is not None else None,
                     return_char_indices=True  # Get char indices from attention
                 )
 
-                print(f"DEBUG: Generated {len(chunks)} chunks with char_indices")
-
-                # STEP 4: Map override positions to chunks
-                # Track which character position each chunk starts at in the original line
-                chunk_start_positions = []
-                current_pos = 0
-                for chunk in chunks:
-                    chunk_start_positions.append(current_pos)
-                    current_pos += len(chunk)
-
-                print(f"DEBUG: Chunk start positions: {chunk_start_positions}")
+                print(f"DEBUG: Generated {len(modified_chunks)} chunks with char_indices")
 
                 # STEP 5: Build segment data with override info for each chunk
                 # Stitch chunks into lines based on actual widths
@@ -479,35 +475,23 @@ def write_chunked(
                 current_line_width = 0.0
                 current_line_segment_list = []
 
-                for chunk_idx, (chunk_text, chunk_stroke, char_indices) in enumerate(
-                    zip(chunks, chunk_strokes, chunk_char_indices)
+                for chunk_idx, (original_chunk, modified_chunk, chunk_stroke, char_indices, chunk_overrides) in enumerate(
+                    zip(original_chunks, modified_chunks, chunk_strokes, chunk_char_indices, chunk_override_info)
                 ):
-                    chunk_start = chunk_start_positions[chunk_idx]
-                    chunk_end = chunk_start + len(chunk_text)
-
-                    # Find override positions that fall within this chunk
-                    chunk_override_positions = []
-                    for orig_char_idx, override_char in line_override_positions:
-                        if chunk_start <= orig_char_idx < chunk_end:
-                            # Convert to chunk-local index
-                            local_idx = orig_char_idx - chunk_start
-                            chunk_override_positions.append((local_idx, override_char))
-
-                    has_overrides = len(chunk_override_positions) > 0
-                    print(f"DEBUG: Chunk {chunk_idx} '{chunk_text}': has_overrides={has_overrides}, positions={chunk_override_positions}")
+                    has_overrides = len(chunk_overrides) > 0
+                    print(f"DEBUG: Processing chunk {chunk_idx} '{modified_chunk}': has_overrides={has_overrides}, positions={chunk_overrides}")
+                    if has_overrides:
+                        print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}], len={len(char_indices)}")
 
                     # Calculate chunk width (including estimated override widths)
                     chunk_width = get_stroke_width(chunk_stroke)
 
                     # For width calculation, estimate how much extra space overrides need
-                    # (the actual rendering will shift strokes, but we need to estimate for line breaking)
                     extra_override_width = 0.0
-                    for local_idx, override_char in chunk_override_positions:
+                    for local_idx, override_char in chunk_overrides:
                         override_data = get_random_override(overrides_dict, override_char)
                         if override_data:
-                            # Space creates minimal width, override needs actual width
                             override_w = estimate_override_width(override_data, target_height=60, x_stretch=1.0)
-                            # Add the difference (override width minus space width, plus some spacing)
                             extra_override_width += override_w + (override_w * 0.3)
 
                     effective_chunk_width = chunk_width + extra_override_width
@@ -520,15 +504,15 @@ def write_chunked(
                         potential_width = effective_chunk_width
 
                     # Build segment data
+                    # NOTE: 'text' is the MODIFIED chunk (what was generated)
+                    # override_positions are LOCAL indices within this chunk
                     segment = {
                         'type': 'generated',
-                        'text': input_line[chunk_start:chunk_end],  # Original text (with override chars)
-                        'modified_text': chunk_text,  # Text that was generated (with spaces)
+                        'text': modified_chunk,  # Text that was generated (with spaces)
+                        'original_text': original_chunk,  # Original text (with override chars)
                         'strokes': chunk_stroke,
                         'char_indices': char_indices,  # Attention-based character indices
-                        'override_positions': chunk_override_positions,  # [(local_idx, char), ...]
-                        'chunk_start': chunk_start,
-                        'chunk_end': chunk_end,
+                        'override_positions': chunk_overrides,  # [(local_idx, char), ...]
                     }
 
                     if potential_width <= max_line_width or current_line_width == 0:
@@ -542,7 +526,7 @@ def write_chunked(
                             )
                         else:
                             current_line_stroke = chunk_stroke
-                        current_line_text.append(input_line[chunk_start:chunk_end])
+                        current_line_text.append(original_chunk)
                         current_line_segment_list.append(segment)
                         current_line_width = potential_width
                     else:
@@ -553,7 +537,7 @@ def write_chunked(
                             all_line_segment_data.append(current_line_segment_list)
 
                         current_line_stroke = chunk_stroke
-                        current_line_text = [input_line[chunk_start:chunk_end]]
+                        current_line_text = [original_chunk]
                         current_line_segment_list = [segment]
                         current_line_width = effective_chunk_width
 

From febadcc5ee4da34d384795aa79da9afcade08b4f Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 12:50:09 +1300
Subject: [PATCH 14/21] Refactor style offset logic: adjust override position
 calculations, refine character index handling, and enhance debug logs for
 stroke generation with styles.

---
 handwriting_synthesis/hand/Hand.py  | 34 ++++++++++++++++++++++++-----
 handwriting_synthesis/hand/_draw.py | 10 ++++++++-
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 85bc186..8f326b7 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -458,7 +458,22 @@ def write_chunked(
                                 f"Valid character set is {valid_char_set}"
                             )
 
-                # STEP 4: Generate strokes for modified chunks WITH char_indices
+                # STEP 4: Calculate style offset (char_indices are offset when styles are used)
+                # When styles are used, text is prepended: "style_chars" + " " + actual_text
+                # So char_indices for actual text start at len(style_chars) + 1
+                style_char_offset = 0
+                if styles is not None:
+                    try:
+                        from handwriting_synthesis.config import style_path
+                        style_id = styles if not isinstance(styles, list) else styles[0]
+                        style_chars = np.load(f"{style_path}/style-{style_id}-chars.npy").tostring().decode('utf-8')
+                        style_char_offset = len(style_chars) + 1  # +1 for the space separator
+                        print(f"DEBUG: Style priming active, char_indices offset = {style_char_offset}")
+                    except Exception as e:
+                        print(f"DEBUG: Could not determine style offset: {e}")
+                        style_char_offset = 0
+
+                # STEP 5: Generate strokes for modified chunks WITH char_indices
                 chunk_strokes, chunk_char_indices = self._sample(
                     modified_chunks,
                     biases=[biases] * len(modified_chunks) if biases is not None else None,
@@ -468,7 +483,7 @@ def write_chunked(
 
                 print(f"DEBUG: Generated {len(modified_chunks)} chunks with char_indices")
 
-                # STEP 5: Build segment data with override info for each chunk
+                # STEP 6: Build segment data with override info for each chunk
                 # Stitch chunks into lines based on actual widths
                 current_line_stroke = np.empty((0, 3))
                 current_line_text = []
@@ -479,9 +494,16 @@ def write_chunked(
                     zip(original_chunks, modified_chunks, chunk_strokes, chunk_char_indices, chunk_override_info)
                 ):
                     has_overrides = len(chunk_overrides) > 0
-                    print(f"DEBUG: Processing chunk {chunk_idx} '{modified_chunk}': has_overrides={has_overrides}, positions={chunk_overrides}")
+
+                    # Adjust override positions for style offset
+                    # char_indices from the model include the style prime, so we need to add the offset
+                    adjusted_overrides = [(local_idx + style_char_offset, char) for local_idx, char in chunk_overrides]
+
+                    print(f"DEBUG: Processing chunk {chunk_idx} '{modified_chunk}': has_overrides={has_overrides}")
+                    print(f"DEBUG:   Original positions: {chunk_overrides}")
+                    print(f"DEBUG:   Adjusted positions (with style offset {style_char_offset}): {adjusted_overrides}")
                     if has_overrides:
-                        print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}], len={len(char_indices)}")
+                        print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}], unique values: {np.unique(char_indices)[:20]}...")
 
                     # Calculate chunk width (including estimated override widths)
                     chunk_width = get_stroke_width(chunk_stroke)
@@ -505,14 +527,14 @@ def write_chunked(
 
                     # Build segment data
                     # NOTE: 'text' is the MODIFIED chunk (what was generated)
-                    # override_positions are LOCAL indices within this chunk
+                    # override_positions are ADJUSTED for style offset (to match char_indices)
                     segment = {
                         'type': 'generated',
                         'text': modified_chunk,  # Text that was generated (with spaces)
                         'original_text': original_chunk,  # Original text (with override chars)
                         'strokes': chunk_stroke,
                         'char_indices': char_indices,  # Attention-based character indices
-                        'override_positions': chunk_overrides,  # [(local_idx, char), ...]
+                        'override_positions': adjusted_overrides,  # [(adjusted_idx, char), ...] - ADJUSTED for style offset
                     }
 
                     if potential_width <= max_line_width or current_line_width == 0:
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 2e013db..6bec9d0 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -302,11 +302,18 @@ def _render_strokes_with_overrides(
             exclusion_range = None  # Expanded range for excluding transition strokes
 
             if use_precise_indices:
+                # DEBUG: Show what we're looking for vs what's available
+                unique_indices = np.unique(char_indices)
+                print(f"DEBUG: Looking for char_idx={char_idx} in char_indices")
+                print(f"DEBUG:   char_indices unique values: {unique_indices}")
+                print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}]")
+
                 matching_strokes = np.where(char_indices == char_idx)[0]
                 if len(matching_strokes) > 0:
                     start_idx = matching_strokes[0]
                     end_idx = matching_strokes[-1]
                     stroke_range = (start_idx, end_idx)
+                    print(f"DEBUG:   FOUND {len(matching_strokes)} matching strokes at indices [{start_idx}, {end_idx}]")
 
                     # SIMPLIFIED EXCLUSION: Only exclude the space placeholder strokes
                     # Don't aggressively cut into adjacent characters - this was causing
@@ -327,8 +334,9 @@ def _render_strokes_with_overrides(
                           f"strokes [{expanded_start}, {expanded_end}] ({num_excluded} strokes)")
 
                     insertion_x = ls[start_idx, 0]
+                    print(f"DEBUG:   Insertion X position: {insertion_x:.2f}")
                 else:
-                    print(f"DEBUG: No matching strokes for char_idx={char_idx}, falling back to position estimate")
+                    print(f"DEBUG:   NOT FOUND! char_idx={char_idx} not in char_indices. Falling back to position estimate.")
                     insertion_x = stroke_min_x + (char_idx * avg_char_width)
                     exclusion_range = None
             else:

From 3593856c3b3b957ea55798e9f73e8bfef351421c Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 12:58:27 +1300
Subject: [PATCH 15/21] Refactor char_indices offset handling: remove
 style-specific offset logic, implement auto-detection from data, and refine
 override position calculations with enhanced debug logs.

---
 handwriting_synthesis/hand/Hand.py | 31 +++++++++++-------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 8f326b7..9a8b482 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -458,22 +458,8 @@ def write_chunked(
                                 f"Valid character set is {valid_char_set}"
                             )
 
-                # STEP 4: Calculate style offset (char_indices are offset when styles are used)
-                # When styles are used, text is prepended: "style_chars" + " " + actual_text
-                # So char_indices for actual text start at len(style_chars) + 1
-                style_char_offset = 0
-                if styles is not None:
-                    try:
-                        from handwriting_synthesis.config import style_path
-                        style_id = styles if not isinstance(styles, list) else styles[0]
-                        style_chars = np.load(f"{style_path}/style-{style_id}-chars.npy").tostring().decode('utf-8')
-                        style_char_offset = len(style_chars) + 1  # +1 for the space separator
-                        print(f"DEBUG: Style priming active, char_indices offset = {style_char_offset}")
-                    except Exception as e:
-                        print(f"DEBUG: Could not determine style offset: {e}")
-                        style_char_offset = 0
-
-                # STEP 5: Generate strokes for modified chunks WITH char_indices
+                # STEP 4: Generate strokes for modified chunks WITH char_indices
+                # NOTE: char_indices offset is detected automatically from min(char_indices) in each chunk
                 chunk_strokes, chunk_char_indices = self._sample(
                     modified_chunks,
                     biases=[biases] * len(modified_chunks) if biases is not None else None,
@@ -495,13 +481,18 @@ def write_chunked(
                 ):
                     has_overrides = len(chunk_overrides) > 0
 
-                    # Adjust override positions for style offset
-                    # char_indices from the model include the style prime, so we need to add the offset
-                    adjusted_overrides = [(local_idx + style_char_offset, char) for local_idx, char in chunk_overrides]
+                    # CRITICAL FIX: Detect the actual char_indices offset from the data itself
+                    # char_indices from the model start at min(char_indices), not 0
+                    # This accounts for style priming and any other offsets automatically
+                    actual_offset = int(char_indices.min()) if len(char_indices) > 0 else 0
+
+                    # Adjust override positions using the detected offset
+                    adjusted_overrides = [(local_idx + actual_offset, char) for local_idx, char in chunk_overrides]
 
                     print(f"DEBUG: Processing chunk {chunk_idx} '{modified_chunk}': has_overrides={has_overrides}")
                     print(f"DEBUG:   Original positions: {chunk_overrides}")
-                    print(f"DEBUG:   Adjusted positions (with style offset {style_char_offset}): {adjusted_overrides}")
+                    print(f"DEBUG:   Detected char_indices offset: {actual_offset}")
+                    print(f"DEBUG:   Adjusted positions: {adjusted_overrides}")
                     if has_overrides:
                         print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}], unique values: {np.unique(char_indices)[:20]}...")
 

From 72d80cc2422e8e4ea90e3f1fe533d7bc209aa86e Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 13:03:08 +1300
Subject: [PATCH 16/21] Refactor override logic: remove stroke exclusion,
 implement X-position-based shifting for gap creation, and simplify rendering
 logic by removing exclusion zones.

---
 handwriting_synthesis/hand/_draw.py | 82 ++++++++---------------------
 1 file changed, 22 insertions(+), 60 deletions(-)

diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 6bec9d0..a0cfdec 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -305,7 +305,6 @@ def _render_strokes_with_overrides(
                 # DEBUG: Show what we're looking for vs what's available
                 unique_indices = np.unique(char_indices)
                 print(f"DEBUG: Looking for char_idx={char_idx} in char_indices")
-                print(f"DEBUG:   char_indices unique values: {unique_indices}")
                 print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}]")
 
                 matching_strokes = np.where(char_indices == char_idx)[0]
@@ -313,34 +312,24 @@ def _render_strokes_with_overrides(
                     start_idx = matching_strokes[0]
                     end_idx = matching_strokes[-1]
                     stroke_range = (start_idx, end_idx)
-                    print(f"DEBUG:   FOUND {len(matching_strokes)} matching strokes at indices [{start_idx}, {end_idx}]")
-
-                    # SIMPLIFIED EXCLUSION: Only exclude the space placeholder strokes
-                    # Don't aggressively cut into adjacent characters - this was causing
-                    # visible artifacts by removing actual character strokes.
-                    expanded_start = start_idx
-                    expanded_end = end_idx
-
-                    # Only extend to include pen-up transitions at boundaries
-                    # This catches connecting strokes that are part of the transition
-                    while expanded_start > 0 and ls[expanded_start - 1, 2] > 0.5:
-                        expanded_start -= 1
-                    while expanded_end < ls.shape[0] - 1 and ls[expanded_end, 2] > 0.5:
-                        expanded_end += 1
-
-                    exclusion_range = (expanded_start, expanded_end)
-                    num_excluded = expanded_end - expanded_start + 1
-                    print(f"DEBUG: Exclusion for '{override_char}' (char_idx={char_idx}): "
-                          f"strokes [{expanded_start}, {expanded_end}] ({num_excluded} strokes)")
 
+                    # Get the X position at the START of the space placeholder strokes
                     insertion_x = ls[start_idx, 0]
+                    print(f"DEBUG:   FOUND {len(matching_strokes)} matching strokes at indices [{start_idx}, {end_idx}]")
                     print(f"DEBUG:   Insertion X position: {insertion_x:.2f}")
+
+                    # NEW APPROACH: Don't exclude strokes! The char_indices boundaries are fuzzy.
+                    # Instead, we'll use X-position based shifting to create a gap.
+                    # Set exclusion_range to None to disable stroke exclusion.
+                    exclusion_range = None
                 else:
                     print(f"DEBUG:   NOT FOUND! char_idx={char_idx} not in char_indices. Falling back to position estimate.")
-                    insertion_x = stroke_min_x + (char_idx * avg_char_width)
+                    insertion_x = stroke_min_x + ((char_idx - char_indices.min()) * avg_char_width)
+                    stroke_range = None
                     exclusion_range = None
             else:
                 insertion_x = stroke_min_x + (char_idx * avg_char_width)
+                stroke_range = None
                 exclusion_range = None
 
             override_info.append({
@@ -376,25 +365,18 @@ def _render_strokes_with_overrides(
     for info in override_info:
         char_idx = info['char_idx']
         override_width = info['override_width']
-        exclusion_range = info.get('exclusion_range') or info.get('stroke_range')
+        exclusion_range = info.get('exclusion_range')  # Only use explicit exclusion_range, NOT stroke_range
 
         # Add small spacing around override (like natural character spacing)
         spacing = avg_char_width * 0.15
         total_shift = override_width + spacing * 2
 
-        if use_precise_indices and exclusion_range is not None:
-            start_idx, end_idx = exclusion_range
-            # Add all strokes in exclusion range to the set
-            for idx in range(start_idx, end_idx + 1):
-                excluded_stroke_indices.add(idx)
-            # Shift all strokes AFTER the exclusion range
-            cumulative_shift[end_idx + 1:] += total_shift
-            print(f"DEBUG: Excluding strokes [{start_idx}, {end_idx}], shifting after by {total_shift:.2f}")
-        else:
-            # Fallback: shift based on X position
-            insertion_x = info['insertion_x']
-            mask = ls[:, 0] > insertion_x
-            cumulative_shift[mask] += total_shift
+        # ALWAYS use X-position based shifting - this is more reliable than stroke exclusion
+        # The char_indices boundaries are fuzzy and excluding strokes cuts into adjacent chars
+        insertion_x = info['insertion_x']
+        mask = ls[:, 0] > insertion_x
+        cumulative_shift[mask] += total_shift
+        print(f"DEBUG: X-position shift at {insertion_x:.2f}, shifting {np.sum(mask)} strokes by {total_shift:.2f}")
 
     # Apply shifts to X coordinates
     ls_shifted[:, 0] += cumulative_shift
@@ -411,31 +393,11 @@ def _render_strokes_with_overrides(
     prev_eos = 1.0
     commands = []
 
-    if use_precise_indices:
-        # Use the expanded exclusion set (includes transition strokes)
-        for stroke_idx, (x, y, eos) in enumerate(zip(*ls_render.T)):
-            if stroke_idx in excluded_stroke_indices:
-                # Skip this stroke, mark as stroke break
-                prev_eos = 1.0
-            else:
-                commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-                prev_eos = eos
-    else:
-        # Fallback using exclusion zones
-        exclusion_zones = []
-        for info in override_info:
-            zone_start = info['insertion_x'] - shifted_min_x
-            zone_end = zone_start + info['override_width']
-            exclusion_zones.append((zone_start, zone_end))
-
-        for x, y, eos in zip(*ls_render.T):
-            rel_x = x - cursor_x
-            in_exclusion = any(start <= rel_x <= end for start, end in exclusion_zones)
-            if in_exclusion:
-                prev_eos = 1.0
-            else:
-                commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
-                prev_eos = eos
+    # RENDER ALL STROKES - no exclusion!
+    # We use X-position shifting to create gaps, so all strokes are valid
+    for x, y, eos in zip(*ls_render.T):
+        commands.append('{}{},{}'.format('M' if prev_eos == 1.0 else 'L', x, y))
+        prev_eos = eos
 
     if commands:
         p = ' '.join(commands)

From f3160380aeb9e2144e82dc9d7f0ab1388046c3d7 Mon Sep 17 00:00:00 2001
From: Arie Joe <ariej00@outlook.com>
Date: Wed, 21 Jan 2026 20:24:26 +1300
Subject: [PATCH 17/21] Refactor precise index handling: enhance stroke
 matching with threshold-based character searches, refine override
 positioning, and improve debug logging for gap and shift calculations.

---
 handwriting_synthesis/hand/_draw.py | 108 ++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 28 deletions(-)

diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index a0cfdec..8f9fa6c 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -302,31 +302,61 @@ def _render_strokes_with_overrides(
             exclusion_range = None  # Expanded range for excluding transition strokes
 
             if use_precise_indices:
-                # DEBUG: Show what we're looking for vs what's available
-                unique_indices = np.unique(char_indices)
                 print(f"DEBUG: Looking for char_idx={char_idx} in char_indices")
                 print(f"DEBUG:   char_indices range: [{char_indices.min()}, {char_indices.max()}]")
 
-                matching_strokes = np.where(char_indices == char_idx)[0]
-                if len(matching_strokes) > 0:
-                    start_idx = matching_strokes[0]
-                    end_idx = matching_strokes[-1]
-                    stroke_range = (start_idx, end_idx)
-
-                    # Get the X position at the START of the space placeholder strokes
-                    insertion_x = ls[start_idx, 0]
-                    print(f"DEBUG:   FOUND {len(matching_strokes)} matching strokes at indices [{start_idx}, {end_idx}]")
-                    print(f"DEBUG:   Insertion X position: {insertion_x:.2f}")
-
-                    # NEW APPROACH: Don't exclude strokes! The char_indices boundaries are fuzzy.
-                    # Instead, we'll use X-position based shifting to create a gap.
-                    # Set exclusion_range to None to disable stroke exclusion.
-                    exclusion_range = None
+                # IMPROVED APPROACH: Find characters with SUFFICIENT strokes (not just immediate neighbors)
+                # Spaces may have very few strokes, so we search outward until we find substantial characters
+                min_strokes_threshold = 3  # Require at least this many strokes to be reliable
+
+                # Search backwards for previous substantial character
+                prev_strokes = np.array([], dtype=int)
+                for search_idx in range(char_idx - 1, int(char_indices.min()) - 1, -1):
+                    candidate_strokes = np.where(char_indices == search_idx)[0]
+                    if len(candidate_strokes) >= min_strokes_threshold:
+                        prev_strokes = candidate_strokes
+                        print(f"DEBUG:   Found prev char at idx {search_idx} with {len(candidate_strokes)} strokes")
+                        break
+
+                # Search forwards for next substantial character
+                next_strokes = np.array([], dtype=int)
+                for search_idx in range(char_idx + 1, int(char_indices.max()) + 1):
+                    candidate_strokes = np.where(char_indices == search_idx)[0]
+                    if len(candidate_strokes) >= min_strokes_threshold:
+                        next_strokes = candidate_strokes
+                        print(f"DEBUG:   Found next char at idx {search_idx} with {len(candidate_strokes)} strokes")
+                        break
+
+                if len(prev_strokes) > 0 and len(next_strokes) > 0:
+                    # Get the X position at the END of previous character
+                    prev_end_x = ls[prev_strokes[-1], 0]
+                    # Get the X position at the START of next character
+                    next_start_x = ls[next_strokes[0], 0]
+                    # Insert 30% of the way into the gap, measured from the end of the previous character
+                    # (i.e. weighted towards prev_end_x: closer to the previous character than to the next one)
+                    insertion_x = prev_end_x + (next_start_x - prev_end_x) * 0.3
+                    stroke_range = (prev_strokes[-1], next_strokes[0])
+                    print(f"DEBUG:   Using BETWEEN approach: prev ends at {prev_end_x:.2f}, next starts at {next_start_x:.2f}")
+                    print(f"DEBUG:   Insertion X position: {insertion_x:.2f} (30% into gap)")
+                elif len(prev_strokes) > 0:
+                    # Only have previous character - insert after it
+                    prev_end_x = ls[prev_strokes[-1], 0]
+                    insertion_x = prev_end_x + avg_char_width * 0.3
+                    stroke_range = (prev_strokes[-1], prev_strokes[-1])
+                    print(f"DEBUG:   Using AFTER-PREV approach: inserting after {prev_end_x:.2f}")
+                elif len(next_strokes) > 0:
+                    # Only have next character - insert before it
+                    next_start_x = ls[next_strokes[0], 0]
+                    insertion_x = next_start_x - avg_char_width * 0.3
+                    stroke_range = (next_strokes[0], next_strokes[0])
+                    print(f"DEBUG:   Using BEFORE-NEXT approach: inserting before {next_start_x:.2f}")
                 else:
-                    print(f"DEBUG:   NOT FOUND! char_idx={char_idx} not in char_indices. Falling back to position estimate.")
+                    # Fallback to position estimate
+                    print(f"DEBUG:   No adjacent chars found. Falling back to position estimate.")
                     insertion_x = stroke_min_x + ((char_idx - char_indices.min()) * avg_char_width)
                     stroke_range = None
-                    exclusion_range = None
+
+                exclusion_range = None
             else:
                 insertion_x = stroke_min_x + (char_idx * avg_char_width)
                 stroke_range = None
@@ -365,15 +395,34 @@ def _render_strokes_with_overrides(
     for info in override_info:
         char_idx = info['char_idx']
         override_width = info['override_width']
-        exclusion_range = info.get('exclusion_range')  # Only use explicit exclusion_range, NOT stroke_range
+        stroke_range = info.get('stroke_range')
 
         # Add small spacing around override (like natural character spacing)
-        spacing = avg_char_width * 0.15
-        total_shift = override_width + spacing * 2
+        spacing = avg_char_width * 0.1  # Reduced from 0.15
+
+        # Calculate the existing gap width (space placeholder takes some natural width)
+        insertion_x = info['insertion_x']
+
+        # Get the existing space width from the stroke range
+        if stroke_range is not None:
+            prev_stroke_idx, next_stroke_idx = stroke_range
+            # The existing gap is from end of prev char to start of next char
+            existing_gap = ls[next_stroke_idx, 0] - ls[prev_stroke_idx, 0]
+        else:
+            existing_gap = avg_char_width * 0.5  # Fallback estimate
+
+        # Only shift by the ADDITIONAL space needed beyond what's already there
+        # We want: existing_gap -> override_width + small_spacing
+        extra_needed = (override_width + spacing) - existing_gap
+        total_shift = max(0, extra_needed)
+
+        print(f"DEBUG: existing_gap={existing_gap:.2f}, override_width={override_width:.2f}, extra_needed={extra_needed:.2f}")
+
+        # Store for SVG positioning
+        info['existing_gap'] = existing_gap
 
         # ALWAYS use X-position based shifting - this is more reliable than stroke exclusion
         # The char_indices boundaries are fuzzy and excluding strokes cuts into adjacent chars
-        insertion_x = info['insertion_x']
         mask = ls[:, 0] > insertion_x
         cumulative_shift[mask] += total_shift
         print(f"DEBUG: X-position shift at {insertion_x:.2f}, shifting {np.sum(mask)} strokes by {total_shift:.2f}")
@@ -413,18 +462,21 @@ def _render_strokes_with_overrides(
         override_data = info['override_data']
         override_width = info['override_width']
         stroke_range = info['stroke_range']
+        existing_gap = info.get('existing_gap', avg_char_width * 0.5)
 
-        spacing = avg_char_width * 0.15
+        # Small spacing before override (0.05 factor: half of the 0.1 factor used in the shift calculation)
+        spacing = avg_char_width * 0.05  # Small gap before override
 
         # Calculate position accounting for previous shifts
         if use_precise_indices and stroke_range is not None:
-            start_idx, end_idx = stroke_range
-            # Use the shifted position
-            base_x = ls_shifted[start_idx, 0] - shifted_min_x + cursor_x
+            prev_stroke_idx, next_stroke_idx = stroke_range
+            # Position after the previous character ends (in shifted coordinates)
+            prev_end_x_shifted = ls_shifted[prev_stroke_idx, 0]
+            base_x = prev_end_x_shifted - shifted_min_x + cursor_x
         else:
             base_x = info['insertion_x'] - stroke_min_x + cursor_x + running_shift
 
-        # Add spacing before the override
+        # Place override with small spacing after previous character
         override_start_x = base_x + spacing
 
         # Position override SVG

From 628e7d20d9bb5568b5380a09a7c1782ff03ebd9c Mon Sep 17 00:00:00 2001
From: ariedotcodotnz <ariedotcodotnz@users.noreply.github.com>
Date: Sun, 31 May 2026 04:08:24 +0000
Subject: [PATCH 18/21] fix(chunking): correct 'balanced' strategy, honor
 target_chars, split long words

Improvements to text chunking (operations/chunking.py) -- the step that splits
text before generation and drives how lines fill. No model retraining.

- Fix 'balanced' strategy: its punctuation branch sat behind an if/elif on
  overlapping sets and was unreachable, so the DEFAULT strategy never broke at
  commas/semicolons. 'balanced' now considers sentence + punctuation breaks.
- Honor target_chars_per_chunk: it was accepted everywhere but never used. The
  character budget now bounds the break-search window so breaks land on real
  punctuation that also fits the target, giving even line filling.
- Hard-split pathological space-less tokens (URLs, long ids) so they cannot
  exceed the model's sequence limit or run off the page (lossless reassembly).
- Guarantee loop progress (>=1 word/iter) under degenerate params.

Adds model-free tests (tests/test_operations.py) covering all strategies, the
character budget, long-word splitting, and whitespace handling. 8/8 passing.

Note: the original local work also touched _draw.py/Hand.py (override spacing,
per-line auto-size). Those were dropped during rebase because origin/develop
independently rewrote that override/layout subsystem (char_indices,
effective_target_h) and supersedes them; this commit keeps only the
upstream-untouched chunking improvements.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../hand/operations/chunking.py               | 220 ++++++++++++------
 tests/test_operations.py                      | 117 ++++++++++
 2 files changed, 262 insertions(+), 75 deletions(-)
 create mode 100644 tests/test_operations.py

diff --git a/handwriting_synthesis/hand/operations/chunking.py b/handwriting_synthesis/hand/operations/chunking.py
index 175ce6e..ba1427e 100644
--- a/handwriting_synthesis/hand/operations/chunking.py
+++ b/handwriting_synthesis/hand/operations/chunking.py
@@ -1,6 +1,68 @@
 """Text chunking logic for improved handwriting generation."""
 
-from typing import List
+from typing import List, Optional
+
+
+# Tokens that mark the end of a sentence -- strong, high-priority break points.
+_SENTENCE_ENDERS = ('.', '!', '?')
+# Softer break points; a comma/semicolon is a natural place to end a chunk.
+_PUNCTUATION_BREAKS = (',', ';', ':')
+
+
+def _hard_split_long_word(word: str, max_chars: int) -> List[str]:
+    """Break a single token longer than ``max_chars`` into ``<= max_chars`` pieces.
+
+    Normal words are returned unchanged. This only triggers for pathological
+    tokens (URLs, long identifiers, base64 blobs) that would otherwise create an
+    over-long RNN sequence or run off the edge of the page, since such a token
+    contains no spaces for the wrapper to break on.
+
+    Args:
+        word: The token to (possibly) split.
+        max_chars: Maximum characters allowed per piece.
+
+    Returns:
+        List of one or more sub-tokens, each at most ``max_chars`` long.
+    """
+    if max_chars <= 0 or len(word) <= max_chars:
+        return [word]
+    return [word[i:i + max_chars] for i in range(0, len(word), max_chars)]
+
+
+def _find_break_point(
+    words: List[str],
+    start: int,
+    search_lo: int,
+    search_hi: int,
+    use_sentence: bool,
+    use_punctuation: bool,
+) -> Optional[int]:
+    """Find a punctuation-based chunk boundary within a word range.
+
+    Sentence terminators take priority and break as early as possible (so a
+    sentence becomes its own chunk). Failing that, the *latest* soft punctuation
+    break in range is used, which fills the chunk as much as possible while still
+    ending on a natural pause.
+
+    Args:
+        words: Full list of words.
+        start: Index of the first word in the current chunk.
+        search_lo: First word index to consider as a break point.
+        search_hi: One past the last word index to consider.
+        use_sentence: Whether to break on sentence terminators (. ! ?).
+        use_punctuation: Whether to break on soft punctuation (, ; :).
+
+    Returns:
+        A word count for the chunk if a break was found, otherwise ``None``.
+    """
+    punctuation_break = None
+    for j in range(search_lo, search_hi):
+        word = words[j]
+        if use_sentence and word.endswith(_SENTENCE_ENDERS):
+            return j - start + 1
+        if use_punctuation and word.endswith(_PUNCTUATION_BREAKS):
+            punctuation_break = j - start + 1  # keep the latest one in range
+    return punctuation_break
 
 
 def split_text_into_chunks(
@@ -19,19 +81,22 @@ def split_text_into_chunks(
     - 'word_length': Adjusts based on average word length (original behavior)
     - 'sentence': Respects sentence boundaries (periods, !, ?)
     - 'punctuation': Prefers to break at punctuation marks (commas, semicolons)
-    - 'balanced': Combines word length + punctuation awareness
+    - 'balanced': Combines word length + sentence + punctuation awareness
     - 'off': Fixed chunk sizes (no adaptation)
 
     This method creates more natural chunks by:
     1. Using more words if they're short (better context for the model)
     2. Using fewer words if they're long (avoid exceeding limits)
     3. Respecting sentence and punctuation boundaries when enabled
-    4. Ensuring reasonable min/max bounds
+    4. Keeping chunk length near ``target_chars_per_chunk`` for even line filling
+    5. Ensuring reasonable min/max bounds
 
     Args:
         text: Input text to split.
         words_per_chunk: Target number of words per chunk (used as baseline).
-        target_chars_per_chunk: Target character count per chunk (default: 25).
+        target_chars_per_chunk: Soft upper bound on characters per chunk. Chunks
+            are trimmed back toward this length (never below ``min_words``) so the
+            generated pieces stay a consistent size.
         min_words: Minimum words per chunk.
         max_words: Maximum words per chunk.
         adaptive_chunking: Enable adaptive chunking.
@@ -44,12 +109,36 @@ def split_text_into_chunks(
     leading_space = len(text) - len(text.lstrip())
     trailing_space = len(text) - len(text.rstrip())
 
-    words = text.split()
-    if not words:
+    raw_words = text.split()
+    if not raw_words:
         # If only whitespace, return it as-is
         return [text] if text else []
 
-    # Non-adaptive mode: fixed chunk sizes
+    # Character budgets. The soft cap keeps chunks near the requested target; the
+    # hard cap only breaks pathological space-less tokens so they cannot blow past
+    # the model's sequence limit. A normal long word (e.g. "internationalization")
+    # stays intact because it is shorter than the hard cap.
+    soft_char_cap = max(1, int(target_chars_per_chunk))
+    hard_word_cap = max(soft_char_cap * 2, 40)
+
+    # Pre-split any token that is, on its own, longer than the hard cap. For normal
+    # text this is a no-op, so word-based logic below is unchanged.
+    words: List[str] = []
+    for w in raw_words:
+        words.extend(_hard_split_long_word(w, hard_word_cap))
+
+    def _chunk_char_len(start: int, count: int) -> int:
+        """Character length of ``count`` words joined with single spaces."""
+        return len(' '.join(words[start:start + count]))
+
+    def _fit_to_char_budget(start: int, count: int) -> int:
+        """Shrink ``count`` so the chunk fits the soft char cap (keeps >= min_words)."""
+        lower_bound = min(min_words, len(words) - start)
+        while count > lower_bound and _chunk_char_len(start, count) > soft_char_cap:
+            count -= 1
+        return max(1, count)
+
+    # Non-adaptive mode: fixed chunk sizes (still honours the hard word cap above).
     if not adaptive_chunking or adaptive_strategy == 'off':
         chunks = []
         for i in range(0, len(words), words_per_chunk):
@@ -66,83 +155,64 @@ def split_text_into_chunks(
     chunks = []
     i = 0
 
-    # Sentence boundary markers
-    sentence_enders = {'.', '!', '?'}
-    punctuation_breaks = {',', ';', ':', '--'}
+    use_sentence = adaptive_strategy in ('sentence', 'balanced')
+    use_punctuation = adaptive_strategy in ('punctuation', 'balanced')
+    use_word_length = adaptive_strategy in ('word_length', 'balanced')
+    # A sentence terminator is a strong break for ANY punctuation-aware strategy
+    # (sentence / punctuation / balanced); soft commas/semicolons only break when
+    # punctuation awareness is on. NOTE: 'balanced' must consider both -- the old
+    # if/elif on overlapping sets made the punctuation branch unreachable for it
+    # (and 'balanced' is the default strategy).
+    break_on_sentence = use_sentence or use_punctuation
 
     while i < len(words):
-        # Start with the target words per chunk
-        chunk_word_count = words_per_chunk
-
-        # Look ahead to see the average word length
-        lookahead_end = min(i + words_per_chunk * 2, len(words))
-        lookahead_words = words[i:lookahead_end]
-
-        # Word length adaptation (used in word_length and balanced strategies)
-        if adaptive_strategy in ('word_length', 'balanced') and lookahead_words:
-            avg_word_length = sum(len(w) for w in lookahead_words) / len(lookahead_words)
+        remaining = len(words) - i
 
-            # Adjust chunk size based on word length
-            if avg_word_length < 4:  # Short words (a, an, the, is, of, etc.)
-                # Use more words to provide better context
-                chunk_word_count = min(max_words, int(words_per_chunk * 1.5))
-            elif avg_word_length > 7:  # Long words
-                # Use fewer words to avoid too long chunks
-                chunk_word_count = max(min_words, int(words_per_chunk * 0.75))
-
-        # Ensure we stay within bounds
-        chunk_word_count = max(min_words, min(max_words, chunk_word_count))
-
-        # Don't exceed remaining words
-        chunk_word_count = min(chunk_word_count, len(words) - i)
-
-        # Sentence-aware chunking (sentence and balanced strategies)
-        if adaptive_strategy in ('sentence', 'balanced'):
-            # Look for sentence boundaries within our chunk range
-            search_end = min(i + max_words, len(words))
-            for j in range(i + min_words, search_end):
-                word = words[j]
-                # Check if word ends with sentence terminator
-                if any(word.endswith(char) for char in sentence_enders):
-                    # Found sentence end, use this as chunk boundary
-                    chunk_word_count = j - i + 1
-                    break
-
-        # Punctuation-aware chunking (punctuation and balanced strategies)
-        elif adaptive_strategy in ('punctuation', 'balanced'):
-            # Look for punctuation breaks within our chunk range
-            search_start = i + min_words
-            search_end = min(i + chunk_word_count + 2, len(words))
-            best_break = None
-
-            for j in range(search_start, search_end):
-                word = words[j]
-                # Check for sentence enders first (higher priority)
-                if any(word.endswith(char) for char in sentence_enders):
-                    best_break = j - i + 1
-                    break
-                # Check for punctuation breaks (lower priority)
-                elif any(word.endswith(char) for char in punctuation_breaks):
-                    best_break = j - i + 1
-
-            if best_break:
-                chunk_word_count = best_break
-
-        # Final bounds check
+        # 1. Baseline chunk size, optionally adapted to average word length so that
+        #    short words pack more per chunk and long words pack fewer.
+        chunk_word_count = words_per_chunk
+        if use_word_length:
+            lookahead_words = words[i:min(i + words_per_chunk * 2, len(words))]
+            if lookahead_words:
+                avg_word_length = sum(len(w) for w in lookahead_words) / len(lookahead_words)
+                if avg_word_length < 4:                       # short words
+                    chunk_word_count = min(max_words, int(words_per_chunk * 1.5))
+                elif avg_word_length > 7:                     # long words
+                    chunk_word_count = max(min_words, int(words_per_chunk * 0.75))
         chunk_word_count = max(min_words, min(max_words, chunk_word_count))
-        chunk_word_count = min(chunk_word_count, len(words) - i)
+        chunk_word_count = min(chunk_word_count, remaining)
+
+        # 2. Character budget: the most words that still fit the soft cap. This
+        #    bounds everything below so chunks stay near target_chars_per_chunk.
+        budget_max = _fit_to_char_budget(i, min(max_words, remaining))
+
+        # 3. Prefer a natural break (sentence/punctuation) *within* the budget
+        #    window, so the break lands on real punctuation that also fits the
+        #    target -- rather than trimming a good break back to mid-phrase.
+        if break_on_sentence:
+            search_lo = i + min_words
+            search_hi = i + min(budget_max, remaining)
+            break_point = _find_break_point(
+                words, i, search_lo, search_hi, break_on_sentence, use_punctuation
+            )
+            if break_point:
+                chunk_word_count = break_point
+            else:
+                # No natural break in range: keep the baseline size, capped by budget.
+                chunk_word_count = min(chunk_word_count, budget_max)
+        else:
+            # word_length / off: no punctuation awareness, just honour the budget.
+            chunk_word_count = min(chunk_word_count, budget_max)
+
+        # Final bounds: never below the word floor, never past the remaining words,
+        # and always at least one word so the loop is guaranteed to make progress.
+        chunk_word_count = min(max(min_words, chunk_word_count), remaining)
+        chunk_word_count = max(1, chunk_word_count)
 
         # Create the chunk
         chunk_words = words[i:i + chunk_word_count]
         chunk_text = ' '.join(chunk_words)
 
-        # If chunk is too long (> 50 chars), split it
-        if len(chunk_text) > 50 and len(chunk_words) > min_words:
-            # Use fewer words
-            chunk_word_count = max(min_words, len(chunk_words) // 2)
-            chunk_words = words[i:i + chunk_word_count]
-            chunk_text = ' '.join(chunk_words)
-
         # Add leading space to first chunk
         if i == 0 and leading_space > 0:
             chunk_text = ' ' * leading_space + chunk_text
diff --git a/tests/test_operations.py b/tests/test_operations.py
new file mode 100644
index 0000000..25d9d2a
--- /dev/null
+++ b/tests/test_operations.py
@@ -0,0 +1,117 @@
+"""Tests for the text-chunking operation used to split text before generation.
+
+These tests are model-free -- they exercise the pure wrapping/sizing logic and
+never load the RNN -- so they run fast and anywhere. Run with:
+
+    pytest tests/test_operations.py        # if pytest is installed
+    python tests/test_operations.py        # standalone fallback runner
+"""
+
+import os
+import sys
+
+# Make the project importable when run directly (python tests/test_operations.py).
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
+
+from handwriting_synthesis.hand.operations.chunking import split_text_into_chunks
+
+
+def _hard_cap(target_chars):
+    """Length above which the chunker hard-splits a token (see chunking.py)."""
+    return max(40, 2 * int(target_chars))
+
+
+PUNCTUATED = "I went home, then I slept, and later, after dinner, I read a long book."
+
+
+def _max_token_len(chunks):  # longest whitespace-delimited token in any chunk (0 if none)
+    return max(map(len, ' '.join(chunks).split()), default=0)
+
+
+def test_balanced_breaks_at_punctuation():
+    """The 'balanced' strategy must break at commas, not only at sentence ends.
+
+    Guards against a regression of the unreachable ``elif`` branch that used to
+    leave the punctuation logic as dead code under the 'balanced' strategy.
+    """
+    options = dict(words_per_chunk=3, target_chars_per_chunk=25,
+                   adaptive_strategy='balanced')
+    chunks = split_text_into_chunks(PUNCTUATED, **options)
+    # Punctuation awareness ran if some chunk boundary falls right after a comma.
+    comma_ended = [c for c in chunks if c.rstrip().endswith(',')]
+    assert comma_ended, chunks
+
+
+def test_target_chars_is_honoured():
+    """Every chunk fits the character target unless it is at the min-word floor."""
+    target, min_words = 20, 2
+    chunks = split_text_into_chunks(
+        PUNCTUATED, words_per_chunk=4, target_chars_per_chunk=target,
+        min_words=min_words, max_words=8, adaptive_strategy='balanced',
+    )
+    for chunk in chunks:
+        n_chars, n_words = len(chunk), len(chunk.split())
+        # Only a chunk already at the word floor may overshoot the soft cap.
+        assert n_chars <= target or n_words <= min_words, (chunk, n_chars)
+
+
+def test_long_word_is_hard_split():
+    """A token with no spaces that exceeds the hard cap gets split, losslessly."""
+    target = 25
+    text = "see https://example.com/a/very/long/path/that/keeps/going/and/going/forever/"
+    chunks = split_text_into_chunks(text, words_per_chunk=3, target_chars_per_chunk=target)
+    assert _max_token_len(chunks) <= _hard_cap(target), chunks
+    # Dropping all whitespace, the chunks must spell out the input unchanged.
+    assert "".join(tok for c in chunks for tok in c.split()) == "".join(text.split())
+
+
+def test_normal_long_word_is_not_split():
+    """An ordinary long word below the hard cap must survive in one piece."""
+    word = "internationalization"  # 20 chars; hard cap is max(2 * 25, 40) = 50
+    chunks = split_text_into_chunks(f"the {word} process", target_chars_per_chunk=25)
+    assert [c for c in chunks if word in c], chunks
+
+
+def test_off_strategy_is_fixed_size():
+    """With adaptive_strategy='off' the words are grouped words_per_chunk at a time."""
+    text = "one two three four five six seven"
+    chunks = split_text_into_chunks(text, words_per_chunk=3, adaptive_strategy='off')
+    assert chunks == ["one two three", "four five six", "seven"], chunks
+
+
+def test_sentence_strategy_respects_budget():
+    """'sentence' chunks stay within 25 chars unless they hold at most 2 words."""
+    kwargs = dict(words_per_chunk=3, target_chars_per_chunk=25, adaptive_strategy='sentence')
+    chunks = split_text_into_chunks(PUNCTUATED, **kwargs)
+    assert not [c for c in chunks if len(c) > 25 and len(c.split()) > 2], chunks
+
+
+def test_whitespace_and_empty_inputs():
+    """Empty text gives no chunks; surrounding whitespace is carried through."""
+    assert split_text_into_chunks("") == []
+    assert split_text_into_chunks("   ") == ["   "]
+    padded = split_text_into_chunks("   hello world there   ", words_per_chunk=2)
+    assert padded[0][:3] == "   " and padded[-1][-3:] == "   ", padded
+
+
+def test_progress_guaranteed_with_degenerate_min_words():
+    """A zero word floor (min_words=0) must still terminate and keep every letter."""
+    chunks = split_text_into_chunks(
+        "a b c d e", words_per_chunk=2, min_words=0, target_chars_per_chunk=5)
+    letters = [tok for c in chunks for tok in c.split()]
+    assert "".join(letters) == "abcde", chunks
+
+
+if __name__ == '__main__':  # standalone fallback runner (no pytest required)
+    tests = [obj for name, obj in sorted(globals().items())
+             if name.startswith('test_') and callable(obj)]
+    failed = []
+    for fn in tests:
+        try:
+            fn()
+            print(f"PASS  {fn.__name__}")
+        except Exception as exc:  # noqa: BLE001 - report and continue
+            failed.append(fn.__name__)
+            print(f"FAIL  {fn.__name__}: {type(exc).__name__}: {exc}")
+    print(f"\n{len(tests) - len(failed)}/{len(tests)} passed")
+    sys.exit(1 if failed else 0)

From 243855fe24bbd13508514c159a736a4a4c0353a2 Mon Sep 17 00:00:00 2001
From: ariedotcodotnz <ariedotcodotnz@users.noreply.github.com>
Date: Sun, 31 May 2026 22:30:33 +0000
Subject: [PATCH 19/21] feat(sizing): introduce writing size parameter for
 natural handwriting scaling

---
 CLAUDE.md                              |  61 +++++++++
 handwriting_synthesis/hand/Hand.py     |  79 ++++++++++-
 handwriting_synthesis/hand/_draw.py    | 178 +++++++++++++++++--------
 tests/test_sizing.py                   | 172 ++++++++++++++++++++++++
 webapp/init_db.py                      | 128 +++++++++++++++---
 webapp/static/js/modules/alpine-app.js |   4 +
 webapp/templates/index.html            |  15 ++-
 webapp/utils/generation_utils.py       |   5 +
 8 files changed, 558 insertions(+), 84 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 tests/test_sizing.py

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..5a22eae
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,61 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What this is
+
+WriteBot turns digital text into realistic handwritten SVG/PDF using a trained TensorFlow RNN. There are two layers:
+
+- `handwriting_synthesis/` — the ML engine: model loading, RNN stroke generation, and layout → SVG. No Flask dependency; usable standalone.
+- `webapp/` — a multi-user Flask app (auth, batch jobs, character overrides, admin, usage stats) wrapping the engine.
+
+## Commands
+
+Python 3.10 in CI (local `.venv` is 3.11). `pip install -r requirements.txt` pulls TensorFlow + CUDA and is large.
+
+Run the app:
+- Dev: `python webapp/app.py` — Flask dev server on port 5000, **single-threaded by design** (see model note below).
+- Full stack: `docker compose up` — gunicorn (`webapp.app:app`), a Celery worker, Celery beat, Redis, and nginx. The Docker entrypoint runs `python webapp/init_db.py --auto` first to create/seed the DB.
+
+Tests (model-free, fast — these are the only automated tests):
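+- All: `pytest tests/test_operations.py tests/test_sizing.py`, or run either file directly, e.g. `python tests/test_operations.py`
+- One: `pytest tests/test_operations.py::test_balanced_breaks_at_punctuation`
+- `test_operations.py` covers `handwriting_synthesis/hand/operations/chunking.py`; `test_sizing.py` covers the sizing/spacing logic of `_draw` in `handwriting_synthesis/hand/_draw.py` using synthetic strokes. Anything that touches `Hand` needs the model checkpoint and is slow, so it is not in the suite.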
+
+Lint (as CI runs it): `flake8 webapp --select=E9,F63,F7,F82` (hard gate on syntax/undefined names) and `pylint $(git ls-files '*.py')`.
+
+DB migrations (Flask-Migrate/Alembic, `migrations/`): `FLASK_APP=webapp.app:app flask db migrate -m "..."` then `flask db upgrade`.
+
+CLI batch generation: `python scripts/batch_generate.py` (see `test_batch.csv` for the expected column layout).
+
+## The model — read before changing any generation code
+
+- `Hand()` (`handwriting_synthesis/hand/Hand.py`) builds a `tf.compat.v1` graph and restores a checkpoint (`model/checkpoint/model-17900.*`) plus style-priming arrays (`model/style/style-N-*.npy`). Construction is expensive and the TF session is effectively a **process-global singleton**: route modules instantiate `hand = Hand()` at import time, and `webapp/tasks.py` caches a single lazy `_hand_instance`. This is why the dev server is single-threaded and the Celery worker runs `-P solo --concurrency 1`. Do not assume concurrent calls are safe.
+- The model only understands a fixed, restricted ASCII alphabet (`handwriting_synthesis/drawing/operations.py: alphabet`). Input must be normalized to it via `webapp/utils/text_utils.py: normalize_text_for_model` before generation. Limits: `MAX_CHAR_LEN=120` per line, `MAX_STROKE_LEN=2400`. Retraining is out of scope — improvements happen at the chunking / stitching / layout level, not the weights.
+
+## Generation pipeline (the part that requires reading several files)
+
+1. `webapp/routes/{generation,batch,job}_routes.py` receive a request with 30+ parameters.
+2. `webapp/utils/generation_utils.py`: `parse_generation_params()` normalizes them; `generate_handwriting_to_file()` dispatches.
+3. Text is normalized to the model alphabet. In **non-chunked** mode it is also wrapped to the page width by `text_processor.py` (`TextProcessor`, via `text_utils.wrap_by_canvas`).
+4. `Hand.write()` (one RNN sample per line) or `Hand.write_chunked()` (the default) runs. Chunked mode splits each line into small chunks, samples each, and stitches them into lines using **measured** stroke widths — better line filling and shorter RNN sequences. The stages map to `operations/`: `chunking.py` (text → chunks), `sampling.py` (RNN inference), `stroke_ops.py` (stitch / baseline / rotation).
+5. `handwriting_synthesis/hand/_draw.py: _draw()` does all page layout: unit conversion (`PX_PER_MM = 96/25.4`), auto-sizing strokes to fit the content box, alignment, line-height, baseline/margin jitter, and SVG emission via `svgwrite`.
+
+Chunked vs non-chunked is the `use_chunked` flag (default true). Generation defaults (chunking strategy, words/chars per chunk, page, style/bias) live in `config.json` under `defaults`.
+
+## Character overrides ("character insert")
+
+Users upload custom SVG glyphs for specific characters that get injected into otherwise model-generated handwriting. Persisted as `CharacterOverride` / `CharacterOverrideCollection` (`webapp/models.py`); helpers in `handwriting_synthesis/hand/character_override_utils.py`; rendered in `_draw.py`'s override path. The current approach generates the line with placeholder spaces, then uses the model's attention `char_indices` to cut the strokes precisely and shift them to open a gap for the inserted glyph (`_render_strokes_with_overrides`, `override_positions`). Override characters are exempt from alphabet validation. This subsystem is intricate and changes often — trace the `char_indices` / `override_positions` flow end-to-end before modifying it. Uploaded SVG is untrusted input.
+
+## Web app layout
+
+- Entry: `webapp/app.py` (`webapp.app:app`); Flask extensions in `webapp/extensions.py`; SQLAlchemy models in `webapp/models.py` (User, CharacterOverride(Collection), BatchJob, PageSize/TemplatePreset, Usage/Activity).
+- Routes are split by concern under `webapp/routes/`; reusable logic lives in `webapp/utils/` (`generation_utils`, `text_utils`, `page_utils`, `secure_urls`, `auth_utils`).
+- Async/batch work goes through Celery (`webapp/celery_app.py`, `webapp/tasks.py`, Redis broker) and the `BatchJob` model.
+- Runtime config comes from env (`DATABASE_URL`, `REDIS_URL`, `SECRET_KEY`, Sentry, mail) — see `.env.example`.
+
+## Gotchas
+
+- Page geometry is computed in pixels internally but user-facing values may be mm or px (`units`). `PX_PER_MM` and the paper-size table are defined in **both** `_draw.py` and `webapp/utils/page_utils.py` — keep them consistent.
+- `legibility` (`high` | `normal` | `natural`) sets jitter/interpolation defaults in `_draw.py`; `high` disables all randomness, which is what you want for deterministic output or tests.
+- The RNN/TF code (`handwriting_synthesis/{rnn,tf}/`) uses graph-mode `tf.compat.v1` and legacy Keras (`tf-keras`, `TF_USE_LEGACY_KERAS=1`); it is not idiomatic TF2.
diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 9a8b482..69ca863 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -76,6 +76,7 @@ def write(
         empty_line_spacing=None,
         auto_size=True,
         manual_size_scale=1.0,
+        writing_size_mm=None,
         character_override_collection_id=None,
         margin_jitter_frac=None,
         margin_jitter_coherence=None,
@@ -274,6 +275,7 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
             empty_line_spacing=empty_line_spacing,
             auto_size=auto_size,
             manual_size_scale=manual_size_scale,
+            writing_size_mm=writing_size_mm,
             character_override_collection_id=character_override_collection_id,
             overrides_dict=overrides_dict,
             margin_jitter_frac=margin_jitter_frac,
@@ -302,6 +304,63 @@ def _sample(self, lines, biases=None, styles=None, return_char_indices=False):
             return_char_indices=return_char_indices
         )
 
+    def _size_aware_max_line_width(
+        self, chunk_strokes, max_line_width, page_size, units, margins,
+        orientation, writing_size_mm, x_stretch, auto_size,
+    ):
+        """Cap the wrap width so a full line still renders at the natural size.
+
+        The chunked wrapper measures widths in raw model units, but ``_draw``
+        renders in page pixels: a line grown to ``max_line_width`` raw units can
+        be wider than the page at the target x-height, making ``_draw`` shrink
+        ALL text. Capping the wrap width at the raw width that fills the content
+        box at that x-height keeps letters natural-sized; lines just wrap sooner.
+
+        ``chunk_strokes`` are measured via ``drawing.offsets_to_coords`` (None or
+        < 8-point entries are skipped); ``writing_size_mm`` is the target x-height
+        (None -> NATURAL_WRITING_SIZE_MM). Returns the wrap width in raw units:
+        ``min(max_line_width, fit)`` floored at 1.0, or ``max_line_width`` itself
+        when ``auto_size`` is off, nothing is measurable, or any step raises.
+        """
+        if not auto_size or not chunk_strokes:
+            return max_line_width
+        try:
+            from handwriting_synthesis.hand._draw import (
+                _resolve_page_size, _normalize_margins, _estimate_xheight,
+                PX_PER_MM, NATURAL_WRITING_SIZE_MM, WRAP_SIZE_CALIBRATION,
+            )
+
+            width_px, height_px, _ = _resolve_page_size(page_size, units, 1, 60.0)
+            if orientation == 'landscape':
+                width_px, height_px = height_px, width_px
+            _, m_right, _, m_left = _normalize_margins(margins, units)
+            content_width_px = max(1.0, width_px - (m_left + m_right))
+
+            # Model x-height in raw units (robust median across the chunks).
+            xheights = []
+            for stroke in chunk_strokes:
+                if stroke is None or len(stroke) < 8:
+                    continue
+                xheights.append(_estimate_xheight(drawing.offsets_to_coords(stroke)))
+            if not xheights:
+                return max_line_width
+            model_xheight = float(np.median(xheights))
+
+            target_mm = NATURAL_WRITING_SIZE_MM if writing_size_mm is None else float(writing_size_mm)
+            target_xheight_px = max(1.0, target_mm * PX_PER_MM)
+            xs = float(x_stretch) if x_stretch else 1.0
+            if xs <= 0:
+                xs = 1.0
+
+            # Raw width whose rendered width == content width at the target size.
+            # WRAP_SIZE_CALIBRATION corrects for the per-chunk vs stitched-line
+            # x-height difference so the rendered size matches the requested one.
+            fit_raw = WRAP_SIZE_CALIBRATION * content_width_px * model_xheight / (target_xheight_px * xs)
+            return max(1.0, min(float(max_line_width), fit_raw))
+        except Exception as exc:  # never block generation on a sizing heuristic
+            print(f"Warning: size-aware wrap width failed, using max_line_width: {exc}")
+            return max_line_width
+
     def write_chunked(
         self,
         filename,
@@ -333,6 +392,7 @@ def write_chunked(
         empty_line_spacing=None,
         auto_size=True,
         manual_size_scale=1.0,
+        writing_size_mm=None,
         character_override_collection_id=None,
         margin_jitter_frac=None,
         margin_jitter_coherence=None,
@@ -469,6 +529,13 @@ def write_chunked(
 
                 print(f"DEBUG: Generated {len(modified_chunks)} chunks with char_indices")
 
+                # Wrap to the page at the natural size: cap line width so a full
+                # line renders at the target x-height instead of being shrunk.
+                effective_max_line_width = self._size_aware_max_line_width(
+                    chunk_strokes, max_line_width, page_size, units, margins,
+                    orientation, writing_size_mm, x_stretch, auto_size,
+                )
+
                 # STEP 6: Build segment data with override info for each chunk
                 # Stitch chunks into lines based on actual widths
                 current_line_stroke = np.empty((0, 3))
@@ -528,7 +595,7 @@ def write_chunked(
                         'override_positions': adjusted_overrides,  # [(adjusted_idx, char), ...] - ADJUSTED for style offset
                     }
 
-                    if potential_width <= max_line_width or current_line_width == 0:
+                    if potential_width <= effective_max_line_width or current_line_width == 0:
                         # Chunk fits on current line
                         if current_line_width > 0:
                             current_line_stroke = stitch_strokes(
@@ -605,6 +672,13 @@ def write_chunked(
                     styles=[styles] * len(chunks) if styles is not None else None
                 )
 
+                # Wrap to the page at the natural size: cap line width so a full
+                # line renders at the target x-height instead of being shrunk.
+                effective_max_line_width = self._size_aware_max_line_width(
+                    chunk_strokes, max_line_width, page_size, units, margins,
+                    orientation, writing_size_mm, x_stretch, auto_size,
+                )
+
                 # Stitch chunks into lines based on actual widths
                 current_line_stroke = np.empty((0, 3))
                 current_line_text = []
@@ -620,7 +694,7 @@ def write_chunked(
                     else:
                         potential_width = chunk_width
 
-                    if potential_width <= max_line_width or current_line_width == 0:
+                    if potential_width <= effective_max_line_width or current_line_width == 0:
                         # Chunk fits on current line
                         if current_line_width > 0:
                             current_line_stroke = stitch_strokes(
@@ -714,6 +788,7 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
             empty_line_spacing=empty_line_spacing,
             auto_size=auto_size,
             manual_size_scale=manual_size_scale,
+            writing_size_mm=writing_size_mm,
             character_override_collection_id=character_override_collection_id,
             overrides_dict=overrides_dict,
             margin_jitter_frac=margin_jitter_frac,
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 8f9fa6c..1408076 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -16,6 +16,48 @@
     'Legal': (215.9, 355.6),
 }
 
+# --- Natural handwriting sizing -------------------------------------------------
+# Auto-sizing targets a physical x-height (the height of lowercase letters such as
+# a / e / o) rather than fitting the worst-case stroke extent. ~4.5 mm matches
+# normal ballpoint handwriting. Override per call with writing_size_mm.
+NATURAL_WRITING_SIZE_MM = 4.5
+# Auto line advance as a multiple of the rendered x-height. ~2.1x leaves room for
+# ascenders/descenders without large gaps (natural single spacing).
+LINE_SPACING_PER_XHEIGHT = 2.1
+# Width clamp fits every line up to this multiple of the median line width; lines
+# wider than that are treated as outliers (e.g. an unwrapped long token) and are
+# condensed per-line at render time instead of shrinking the whole document.
+WIDTH_OUTLIER_FACTOR = 2.0
+
+# Empirical correction for the size-aware wrap width (Hand.write_chunked). The wrap
+# width is derived from per-chunk x-heights, but _draw renders stitched, aligned,
+# de-noised lines whose measured x-height is a bit smaller, so without this lines
+# would render ~20% under the requested size. Calibrated against real model output.
+WRAP_SIZE_CALIBRATION = 0.82
+
+
+def _estimate_xheight(ls):
+    """Estimate the x-height (lowercase body height) of a stroke coordinate array.
+
+    Letter bodies form a dense central band of y values while ascenders (l, h, k)
+    and descenders (g, y, p) are a small fraction of the points, so the 10th..90th
+    percentile span of column 1 gives a body height that stays stable whichever
+    letters appear -- unlike the raw maximum extent, which one tall stroke inflates.
+
+    Only differences of y values are used, so the result is the same for the
+    min-shifted layout coords built in ``_draw`` and for unshifted coords.
+    Returns a positive float; arrays under 8 points or with a flat band fall
+    back to the full y extent (max - min), floored at 1e-6. Empty -> 1e-6.
+    """
+    if ls.shape[0] == 0:
+        return 1e-6
+    ys = ls[:, 1]
+    extent = max(1e-6, float(ys.max() - ys.min()))  # origin-independent fallback
+    if ys.shape[0] < 8:
+        return extent
+    band = float(np.percentile(ys, 90.0) - np.percentile(ys, 10.0))
+    return extent if band <= 1e-6 else band
+
 
 def _extract_svg_coordinates(d_string):
     """
@@ -547,6 +589,7 @@ def _draw(
     empty_line_spacing=None,
     auto_size=True,
     manual_size_scale=1.0,
+    writing_size_mm=None,  # Target x-height in mm for natural sizing (None -> NATURAL_WRITING_SIZE_MM)
     character_override_collection_id=None,
     overrides_dict=None,  # New parameter
     margin_jitter_frac=None,  # Bi-directional left margin jitter (fraction of content width)
@@ -606,10 +649,12 @@ def _draw(
     content_width_px = max(1.0, width_px - (m_left + m_right))
     content_height_px = max(1.0, height_px - (m_top + m_bottom))
 
-    line_height_px = _to_px(line_height, units) if line_height is not None else default_line_height_px
-    # Ensure all lines fit vertically
-    max_line_height_px = content_height_px / max(1, len(line_segments) + 0)
-    line_height_px = min(line_height_px, max_line_height_px)
+    # Requested line advance (px). When None we derive a natural one from the text
+    # size below. The previous content_height/num_lines cram is gone: keeping the
+    # output on one page is handled once, after sizing, by scaling size + spacing
+    # together (see the sizing block) so spacing always tracks the letter size.
+    line_height_given = line_height is not None
+    line_height_px = _to_px(line_height, units) if line_height_given else default_line_height_px
 
     # Empty line spacing: if not specified, use regular line_height_px
     empty_line_spacing_px = _to_px(empty_line_spacing, units) if empty_line_spacing is not None else line_height_px
@@ -648,10 +693,14 @@ def _draw(
     if margin_jitter_coherence is None:
         margin_jitter_coherence = {'high': 0.0, 'normal': 0.4}.get(legibility, 0.3)
 
-    # First pass: preprocess each line and compute per-line max allowed scale
+    # First pass: preprocess each line, measuring a robust per-segment x-height
+    # (drives the natural text size) and per-line widths (drives the width clamp).
+    # target_h is only a stable REFERENCE for override scaling -- it cancels out of
+    # the override width math, so its exact value does not affect the output.
     preprocessed_lines = []
-    scale_limits = []
-    raw_heights = []  # Track raw heights for computing average
+    raw_heights = []       # full stroke extents, used for override size matching
+    xheights = []          # robust body heights, used to pick the natural text size
+    line_raw_widths = []   # summed generated raw width per line, for the width clamp
     target_h = 0.95 * line_height_px
 
     for line_idx, segment_list in enumerate(line_segments):
@@ -662,6 +711,7 @@ def _draw(
         preprocessed_segments = []
         color = stroke_colors[line_idx]
         width = stroke_widths[line_idx]
+        line_gen_raw_w = 0.0  # accumulated generated stroke width for this line
 
         for segment in segment_list:
             if segment['type'] == 'override':
@@ -725,18 +775,15 @@ def _draw(
                 ls[:, :2] -= min_xy
                 raw_w = max(1e-6, ls[:, 0].max())
                 raw_h = max(1e-6, ls[:, 1].max())
-                s_w = content_width_px / raw_w
-                s_h = target_h / raw_h
-                scale_limits.append(min(s_w, s_h))
-                raw_heights.append(raw_h)  # Track for average calculation
-
-                # DEBUG: Log preprocessing values
-                print(f"DEBUG preprocess: text='{segment.get('text', '')[:20]}', raw_h={raw_h:.2f}, s_h={s_h:.4f}, s_w={s_w:.4f}, has_overrides={has_overrides}")
+                raw_heights.append(raw_h)          # full extent (override matching)
+                xheights.append(_estimate_xheight(ls))  # robust body height (sizing)
+                line_gen_raw_w += raw_w
 
                 preprocessed_segments.append({
                     'type': 'generated',
                     'strokes': ls,
                     'raw_h': raw_h,  # Store for adjacent override sizing
+                    'raw_w': raw_w,  # cached so the width pass need not re-measure
                     'color': color,
                     'width': width,
                     'text': segment.get('text', ''),  # Add original text for spacing checks
@@ -744,61 +791,74 @@ def _draw(
                     'char_indices': segment_char_indices  # Character indices (preserved for override segments)
                 })
 
+        if line_gen_raw_w > 0:
+            line_raw_widths.append(line_gen_raw_w)
         preprocessed_lines.append(preprocessed_segments if preprocessed_segments else [{'empty': True}])
 
-    # Determine global scale: automatic or manual
+    # ---- Choose the natural handwriting size and line spacing -----------------
+    #
+    # Size is driven by a robust x-height target that is the SAME for every line,
+    # so a single tall or wide line no longer shrinks the whole document. Width is
+    # respected via a median-based clamp (lines up to WIDTH_OUTLIER_FACTOR x the
+    # median width fit the page; wider outliers are condensed per line at render
+    # time). Vertically, text AND spacing scale down together to stay on one page.
+    x_stretch = float(x_stretch) if x_stretch is not None else 1.0
+    if x_stretch <= 0:
+        x_stretch = 1.0
+
+    writing_mm = NATURAL_WRITING_SIZE_MM if writing_size_mm is None else float(writing_size_mm)
+    target_xheight_px = max(1.0, writing_mm * PX_PER_MM)
+
+    typical_xheight = float(np.median(xheights)) if xheights else target_xheight_px
+    size_scale = target_xheight_px / max(1e-6, typical_xheight)
+
     if auto_size:
-        s_global = min(scale_limits) if scale_limits else 1.0
+        s_global = size_scale
+        # Width clamp: fit every NORMAL line within the page, ignoring gross
+        # outliers (a single unwrapped long line is condensed per-line at render
+        # time via line_scale_x instead of shrinking every line -- which is what
+        # used to make the text tiny).
+        if line_raw_widths:
+            median_w = float(np.median(line_raw_widths))
+            normal_widths = [w for w in line_raw_widths if w <= WIDTH_OUTLIER_FACTOR * median_w]
+            width_ref = max(normal_widths) if normal_widths else median_w
+            if width_ref > 1e-6:
+                s_global = min(s_global, content_width_px / (width_ref * x_stretch))
     else:
-        s_global = float(manual_size_scale)
+        # manual_size_scale is now a multiple of the natural size (1.0 == natural).
+        s_global = float(manual_size_scale) * size_scale
 
-    # Compute effective target height for overrides based on actual generated text height
-    # This ensures overrides match the size of surrounding generated text
-    avg_raw_h = sum(raw_heights) / len(raw_heights) if raw_heights else target_h
-    effective_target_h = avg_raw_h * s_global
+    # Rendered x-height after the width clamp, used to derive natural line spacing.
+    rendered_xheight = typical_xheight * s_global
+
+    # Line advance: honour an explicit line_height, otherwise derive one from the
+    # rendered x-height so spacing always tracks the letter size.
+    line_advance_px = line_height_px if line_height_given else LINE_SPACING_PER_XHEIGHT * rendered_xheight
 
-    # DEBUG: Log key scaling values
-    has_overrides = bool(overrides_dict)
-    print(f"DEBUG _draw: overrides={'ENABLED' if has_overrides else 'DISABLED'}, target_h={target_h:.2f}, s_global={s_global:.4f}, avg_raw_h={avg_raw_h:.2f}, effective_target_h={effective_target_h:.2f}")
-
-    # BUGFIX: For small pages where auto_size significantly reduces text scale,
-    # adjust line height to be proportional to the actual rendered text size.
-    # This prevents huge line spacing when text is scaled down to fit narrow pages.
-    if auto_size and scale_limits:
-        # Calculate what the text height would have been without width constraint
-        # scale_limits contains min(s_w, s_h) for each line, where s_h = target_h / raw_h
-        # If s_global is much smaller than what s_h alone would give, text is width-constrained
-        # In that case, effective line height should scale down proportionally
-
-        # Recalculate scale limits considering only height (not width)
-        height_only_scales = []
-        for preprocessed_segments in preprocessed_lines:
-            for segment in preprocessed_segments:
-                if segment.get('type') == 'generated' and 'strokes' in segment:
-                    ls = segment['strokes']
-                    raw_h = max(1e-6, ls[:, 1].max())
-                    s_h = target_h / raw_h
-                    height_only_scales.append(s_h)
-                    break
-
-        if height_only_scales:
-            # The ideal scale based on height alone
-            ideal_height_scale = min(height_only_scales)
-            # If actual scale is significantly smaller (width-constrained), reduce line height
-            if s_global < ideal_height_scale * 0.95:  # Allow 5% tolerance
-                scale_ratio = s_global / ideal_height_scale
-                # Adjust line height proportionally, but keep some minimum spacing
-                adjusted_line_height = line_height_px * scale_ratio
-                # Ensure minimum spacing of at least 20% of original to prevent overlapping
-                line_height_px = max(adjusted_line_height, line_height_px * 0.2)
-                # Also adjust empty line spacing if it was based on line_height_px
-                if empty_line_spacing is None:
-                    empty_line_spacing_px = line_height_px
+    # Keep everything on one page (auto-size only): if the lines would not fit the
+    # content height, scale the size and the spacing down by the same factor.
+    if auto_size:
+        n_rows = max(1, len(preprocessed_lines))
+        needed_height = line_advance_px * (n_rows + 1.0)  # first-line offset + descender slack
+        if needed_height > content_height_px:
+            vfit = content_height_px / needed_height
+            s_global *= vfit
+            line_advance_px *= vfit
+            rendered_xheight *= vfit
+
+    line_height_px = max(1.0, line_advance_px)
+    if empty_line_spacing is None:
+        empty_line_spacing_px = line_height_px
+
+    # Override sizing reference: overrides are sized to neighbouring generated text
+    # via raw_h * s_global; target_h cancels out of the override width math, so its
+    # exact value does not matter as long as it is used consistently.
+    avg_raw_h = sum(raw_heights) / len(raw_heights) if raw_heights else 1.0
+    effective_target_h = avg_raw_h * s_global
 
     # Second pass: render with uniform scale across lines for consistent letter size
     cursor_y = m_top + (3.0 * line_height_px / 4.0)
     rng = np.random.RandomState(42)
-    x_stretch = float(x_stretch) if x_stretch is not None else 1.0
 
     # Pre-generate bi-directional margin jitter for all lines (Gaussian + coherence smoothing)
     num_lines = len(preprocessed_lines)
diff --git a/tests/test_sizing.py b/tests/test_sizing.py
new file mode 100644
index 0000000..346307b
--- /dev/null
+++ b/tests/test_sizing.py
@@ -0,0 +1,172 @@
+"""Dimensional tests for the natural-handwriting sizing in `_draw`.
+
+Model-free: synthetic stroke arrays with a known body height are fed through
+`_draw`, then the rendered SVG is parsed and measured in page pixels. These pin
+down the *behaviour* of the sizing/spacing logic (consistent x-height, spacing
+proportional to size, width does not shrink everything, shrink-to-fit-one-page,
+manual scale as a multiple of natural) without judging visual "naturalness".
+
+Run: `python tests/test_sizing.py` or `pytest tests/test_sizing.py`.
+"""
+
+import os
+import re
+import sys
+import tempfile
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
+
+from handwriting_synthesis import drawing
+from handwriting_synthesis.hand import _draw as draw_mod
+from handwriting_synthesis.hand._draw import _draw, PX_PER_MM, NATURAL_WRITING_SIZE_MM
+
+_COORD = re.compile(r'[ML]\s*([-\d.]+)[\s,]+([-\d.]+)')
+
+
+def _segment(text, width_units=180.0, xheight=20.0, ascender_frac=0.0, n=240, line_idx=0):
+    """A synthetic generated segment.
+
+    Most points sit in the body band [0, xheight]; a fraction are pushed up to
+    ~2x xheight to emulate ascenders, so the 10..90 percentile band ~= xheight.
+    """
+    xs = np.linspace(0.0, width_units, n)
+    ys = np.abs(np.sin(np.linspace(0.0, 9 * np.pi, n))) * xheight  # body in [0, xheight]
+    if ascender_frac > 0:
+        k = max(1, int(n * ascender_frac))
+        ys[np.linspace(0, n - 1, k).astype(int)] = 2.0 * xheight  # occasional tall strokes
+    coords = np.stack([xs, ys, np.zeros(n)], axis=1)
+    coords[-1, 2] = 1.0
+    return {'type': 'generated', 'text': text, 'strokes': drawing.coords_to_offsets(coords),
+            'line_idx': line_idx}
+
+
+def _render(line_segments, page=(210.0, 297.0), margins=20.0, **kw):
+    fd, path = tempfile.mkstemp(suffix='.svg')
+    os.close(fd)
+    opts = dict(page_size=list(page), units='mm', margins=margins, align='left',
+                legibility='high', denoise=False, auto_size=True, background='white')
+    opts.update(kw)
+    _draw(line_segments, [s[0].get('text', '') for s in line_segments], path, **opts)
+    return path
+
+
+def _line_y_bands(path):
+    """Return, per top-level path, (min_y, max_y, p10, p90) of its point y-coords."""
+    root = ET.parse(path).getroot()
+    bands = []
+    for child in root:
+        if child.tag.split('}')[-1] != 'path':
+            continue
+        ys = [float(y) for _, y in _COORD.findall(child.get('d', ''))]
+        if ys:
+            a = np.array(ys)
+            bands.append((a.min(), a.max(), float(np.percentile(a, 10)), float(np.percentile(a, 90))))
+    return bands
+
+
+def _rendered_xheight_mm(path):
+    """Median rendered 10..90 body band across lines, in mm."""
+    bands = _line_y_bands(path)
+    assert bands, "no generated paths in output"
+    spans_px = [p90 - p10 for (_, _, p10, p90) in bands]
+    return float(np.median(spans_px)) / PX_PER_MM
+
+
+def _baseline_spacing_mm(path):
+    """Median gap between consecutive lines' bottoms (a baseline proxy), in mm."""
+    bands = sorted(_line_y_bands(path), key=lambda b: b[1])
+    bottoms = [b[1] for b in bands]
+    if len(bottoms) < 2:
+        return None
+    gaps = np.diff(bottoms)
+    return float(np.median(gaps)) / PX_PER_MM
+
+
+def test_default_xheight_is_natural():
+    """Default auto-size renders a body x-height ~= NATURAL_WRITING_SIZE_MM."""
+    segs = [[_segment("hello world", line_idx=i)] for i in range(4)]
+    xh = _rendered_xheight_mm(_render(segs))
+    assert abs(xh - NATURAL_WRITING_SIZE_MM) <= 0.9, xh
+
+
+def test_writing_size_mm_controls_size():
+    """Rendered x-height tracks the writing_size_mm knob (and bigger = bigger)."""
+    segs = [[_segment("hello world", line_idx=i)] for i in range(4)]
+    small = _rendered_xheight_mm(_render(segs, writing_size_mm=3.0))
+    big = _rendered_xheight_mm(_render(segs, writing_size_mm=6.0))
+    assert abs(small - 3.0) <= 0.8, small
+    assert abs(big - 6.0) <= 1.0, big
+    assert big > small + 1.5
+
+
+def test_xheight_consistent_despite_ascenders():
+    """A line full of ascenders must not shrink the document's x-height.
+
+    This is the core fix: previously the tallest line set a global min scale that
+    shrank everything; now sizing uses a robust body height per line.
+    """
+    no_asc = [[_segment("aaa eee ooo", ascender_frac=0.0, line_idx=i)] for i in range(4)]
+    with_asc = [[_segment("aaa eee ooo", ascender_frac=0.0, line_idx=0)],
+                [_segment("llll kkkk hhhh", ascender_frac=0.30, line_idx=1)],
+                [_segment("aaa eee ooo", ascender_frac=0.0, line_idx=2)],
+                [_segment("aaa eee ooo", ascender_frac=0.0, line_idx=3)]]
+    xh_plain = _rendered_xheight_mm(_render(no_asc))
+    xh_mixed = _rendered_xheight_mm(_render(with_asc))
+    assert abs(xh_plain - xh_mixed) <= 0.7, (xh_plain, xh_mixed)
+
+
+def test_one_wide_line_does_not_shrink_others():
+    """A single very long line must not shrink the size of the normal lines."""
+    normal = [[_segment("hello", width_units=120.0, line_idx=i)] for i in range(5)]
+    with_wide = [[_segment("hello", width_units=120.0, line_idx=0)],
+                 [_segment("x" * 50, width_units=2000.0, line_idx=1)]]  # one huge outlier
+    with_wide += [[_segment("hello", width_units=120.0, line_idx=i)] for i in range(2, 5)]
+    xh_normal = _rendered_xheight_mm(_render(normal))
+    xh_wide = _rendered_xheight_mm(_render(with_wide))
+    # The outlier is condensed per-line, not allowed to shrink everyone.
+    assert xh_wide >= xh_normal - 0.6, (xh_normal, xh_wide)
+
+
+def test_spacing_tracks_size_when_auto():
+    """With auto line height, baseline spacing scales with the writing size."""
+    segs = [[_segment("hello world", line_idx=i)] for i in range(5)]
+    sp_small = _baseline_spacing_mm(_render(segs, writing_size_mm=3.0))
+    sp_big = _baseline_spacing_mm(_render(segs, writing_size_mm=6.0))
+    assert sp_small and sp_big and sp_big > sp_small + 2.0, (sp_small, sp_big)
+
+
+def test_shrinks_to_fit_one_page():
+    """Many lines on a short page shrink (size + spacing) to stay on one page."""
+    many = [[_segment("hello world", line_idx=i)] for i in range(60)]
+    path = _render(many, page=(210.0, 297.0), margins=20.0, writing_size_mm=5.0)
+    bands = _line_y_bands(path)
+    max_y_px = max(b[1] for b in bands)
+    page_h_px = 297.0 * PX_PER_MM
+    assert max_y_px <= page_h_px + 2.0, (max_y_px, page_h_px)            # stays on page
+    assert _rendered_xheight_mm(path) < 5.0                              # was scaled down
+
+
+def test_manual_scale_is_multiple_of_natural():
+    """auto_size=False: manual_size_scale=2 renders ~2x the natural size."""
+    segs = [[_segment("hello world", line_idx=i)] for i in range(3)]
+    natural = _rendered_xheight_mm(_render(segs, auto_size=True))
+    doubled = _rendered_xheight_mm(_render(segs, auto_size=False, manual_size_scale=2.0))
+    assert doubled > natural * 1.6, (natural, doubled)
+
+
+if __name__ == '__main__':
+    tests = [v for k, v in sorted(globals().items())
+             if k.startswith('test_') and callable(v)]
+    failures = 0
+    for fn in tests:
+        try:
+            fn()
+            print(f"PASS  {fn.__name__}")
+        except Exception as exc:  # noqa: BLE001
+            failures += 1
+            print(f"FAIL  {fn.__name__}: {type(exc).__name__}: {exc}")
+    print(f"\n{len(tests) - failures}/{len(tests)} passed")
+    sys.exit(1 if failures else 0)
diff --git a/webapp/init_db.py b/webapp/init_db.py
index c101c7f..e230cf0 100644
--- a/webapp/init_db.py
+++ b/webapp/init_db.py
@@ -6,6 +6,7 @@
 """
 import os
 import sys
+from datetime import datetime
 from getpass import getpass
 import warnings
 
@@ -41,30 +42,117 @@ def get_password_input(prompt="Password: "):
         return input(prompt).strip()
 
 
-def init_database():
+def _placeholder_for(column):
+    """Return a safe non-null backfill value for a newly-added NOT NULL column."""
+    from sqlalchemy import Integer, Numeric, Float, Boolean, DateTime, Date
+    col_type = column.type
+    if isinstance(col_type, (Integer, Numeric, Float)):
+        return 0
+    if isinstance(col_type, Boolean):
+        return False
+    if isinstance(col_type, (DateTime, Date)):
+        return datetime.utcnow()
+    return ''  # strings/text and anything else
+
+
+def _reconcile_missing_columns():
+    """Add columns present in the models but missing from existing tables.
+
+    ``db.create_all()`` creates new tables but never ALTERs existing ones, so a DB
+    created against older models is left missing newly-added columns -- which is
+    exactly how ``users.email`` went missing and made every page 500. For each
+    existing table we add any missing column. The ALTER emits only the column
+    type, so NOT NULL is not enforced by the DB; such columns are backfilled with
+    a placeholder instead. Unique columns get a unique index when the current
+    values allow it. Drops / renames / type changes need a real Alembic migration.
     """
-    Initialize the database tables and run migrations.
+    from sqlalchemy import inspect, text
+
+    inspector = inspect(db.engine)
+    existing_tables = set(inspector.get_table_names())
+    added = []
+
+    for table in db.metadata.sorted_tables:
+        if table.name not in existing_tables:
+            continue  # brand-new table: db.create_all() already created it
+        db_cols = {c['name'] for c in inspector.get_columns(table.name)}
+        for column in table.columns:
+            if column.name in db_cols:
+                continue
+            col_type = column.type.compile(dialect=db.engine.dialect)
+            with db.engine.begin() as conn:
+                conn.execute(text(
+                    f'ALTER TABLE "{table.name}" ADD COLUMN "{column.name}" {col_type}'))
+                if not column.nullable:
+                    conn.execute(
+                        text(f'UPDATE "{table.name}" SET "{column.name}" = :val '
+                             f'WHERE "{column.name}" IS NULL'),
+                        {"val": _placeholder_for(column)})
+                if column.unique:
+                    dupes = conn.execute(text(
+                        f'SELECT COUNT(*) - COUNT(DISTINCT "{column.name}") '
+                        f'FROM "{table.name}"')).scalar()
+                    if not dupes:
+                        conn.execute(text(
+                            f'CREATE UNIQUE INDEX IF NOT EXISTS '
+                            f'"ix_{table.name}_{column.name}" '
+                            f'ON "{table.name}" ("{column.name}")'))
+                    else:
+                        print(f"  [WARN] added {table.name}.{column.name} but left it "
+                              f"non-unique: existing rows have blank/duplicate values; "
+                              f"set them and add a unique index manually.")
+            added.append(f"{table.name}.{column.name}")
+
+    if added:
+        print(f"  Added missing columns: {', '.join(added)}")
+    else:
+        print("  Schema already matches models (no missing columns).")
+    return added
+
 
-    Attempts to run Alembic migrations first. If that fails (e.g., first run),
-    falls back to SQLAlchemy's `db.create_all()`.
+def init_database():
+    """
+    Bring the database schema up to date from any starting state.
+
+    Handles all three cases the app can encounter:
+      * Fresh DB, or a legacy DB created by db.create_all() with no Alembic stamp:
+        build the schema directly from the models (creating missing tables AND
+        adding columns missing from existing tables), then stamp Alembic head so
+        future `flask db upgrade` works.
+      * Alembic-managed DB: apply any pending migrations with `upgrade head`.
+
+    The previous version ran migrations first and fell back to db.create_all() on
+    error, which could not ALTER existing tables and silently left the schema out
+    of date (the users.email outage).
     """
     with app.app_context():
-        print("Running database migrations...")
-        from alembic.config import Config
-        from alembic import command
-
-        # Get the alembic config
-        alembic_cfg = Config(os.path.join(PROJECT_ROOT, "alembic.ini"))
-
-        try:
-            # Run all pending migrations
-            command.upgrade(alembic_cfg, "head")
-            print("Database migrations completed successfully!")
-        except Exception as e:
-            print(f"Error running migrations: {e}")
-            print("\nFalling back to db.create_all()...")
-            db.create_all()
-            print("Database tables created successfully!")
+        # Use Flask-Migrate's helpers (not a hand-built alembic Config): they use
+        # the Migrate extension's configured migrations/ directory. The old code
+        # pointed Config at webapp/alembic.ini -> webapp/alembic/env.py, which does
+        # not exist, so every `upgrade` failed and silently fell back to
+        # create_all() -- the reason the schema drifted (users.email outage).
+        from flask_migrate import upgrade as fm_upgrade, stamp as fm_stamp
+        from alembic.runtime.migration import MigrationContext
+
+        with db.engine.connect() as conn:
+            current_rev = MigrationContext.configure(conn).get_current_revision()
+
+        if current_rev is None:
+            print("No Alembic revision found - syncing schema directly from models...")
+            db.create_all()                # create any missing tables
+            _reconcile_missing_columns()   # add columns missing from existing tables
+            fm_stamp(revision="head")      # mark as current so future upgrades work
+            print("Schema synced from models and stamped to Alembic head.")
+        else:
+            print(f"Alembic revision {current_rev} - applying any pending migrations...")
+            try:
+                fm_upgrade()               # to head
+                print("Database is at Alembic head.")
+            except Exception as e:
+                print(f"Error applying migrations: {e}")
+                print("Falling back to model-based schema sync...")
+                db.create_all()
+                _reconcile_missing_columns()
 
 
 def create_admin_user():
diff --git a/webapp/static/js/modules/alpine-app.js b/webapp/static/js/modules/alpine-app.js
index 0a3ff72..1bb8611 100644
--- a/webapp/static/js/modules/alpine-app.js
+++ b/webapp/static/js/modules/alpine-app.js
@@ -40,6 +40,7 @@ document.addEventListener('alpine:init', () => {
     globalScale: '',
     autoSize: true,
     manualSizeScale: '',
+    writingSizeMm: '',  // target x-height in mm (natural handwriting size; blank = engine default ~4.5)
 
     // Custom size
     pageWidth: '',
@@ -282,6 +283,7 @@ document.addEventListener('alpine:init', () => {
         empty_line_spacing: this.emptyLineSpacing ? Number(this.emptyLineSpacing) : undefined,
         auto_size: this.autoSize,
         manual_size_scale: (!this.autoSize && this.manualSizeScale) ? Number(this.manualSizeScale) : undefined,
+        writing_size_mm: this.writingSizeMm ? Number(this.writingSizeMm) : undefined,
         use_chunked: this.useChunked,
         adaptive_chunking: this.adaptiveChunking,
         adaptive_strategy: this.adaptiveStrategy || undefined,
@@ -651,6 +653,7 @@ document.addEventListener('alpine:init', () => {
       formData.append('global_scale', this.globalScale || '');
       formData.append('auto_size', this.autoSize ? 'true' : 'false');
       formData.append('manual_size_scale', this.manualSizeScale || '');
+      formData.append('writing_size_mm', this.writingSizeMm || '');
       formData.append('biases', this.biases || '');
       formData.append('stroke_colors', this.strokeColors || '');
       formData.append('stroke_widths', this.strokeWidths || '');
@@ -808,6 +811,7 @@ document.addEventListener('alpine:init', () => {
           global_scale: this.globalScale || null,
           auto_size: this.autoSize,
           manual_size_scale: this.manualSizeScale || null,
+          writing_size_mm: this.writingSizeMm || null,
           biases: this.biases || null,
           stroke_colors: this.strokeColors || null,
           stroke_widths: this.strokeWidths || null,
diff --git a/webapp/templates/index.html b/webapp/templates/index.html
index 63655d5..2590284 100644
--- a/webapp/templates/index.html
+++ b/webapp/templates/index.html
@@ -372,18 +372,27 @@ <h3 class="card-title">Page Settings</h3>
                   <label class="flex-label">
                     <input id="autoSize" type="checkbox" x-model="autoSize" class="checkbox-auto" />
                     <span class="bx--label label-with-tooltip m-0">
-                      Auto Size (fit to line height)
+                      Auto Size (natural)
                       <span class="tooltip-icon">
-                        <span class="tooltip-text">Automatically adjusts text size to fit the line height. Recommended for consistent results.</span>
+                        <span class="tooltip-text">Sizes the handwriting to a natural x-height (set by Writing Size) and keeps it on one page. Recommended.</span>
                       </span>
                     </span>
                   </label>
                 </div>
+                <div class="form-group">
+                  <label for="writingSizeMm" class="bx--label label-with-tooltip">
+                    Writing Size (mm)
+                    <span class="tooltip-icon">
+                      <span class="tooltip-text">Target x-height (height of lowercase letters like a, e, o) in millimetres. ~4.5mm is normal pen handwriting; increase for larger writing. Used when Auto Size is on.</span>
+                    </span>
+                  </label>
+                  <input id="writingSizeMm" class="bx--text-input" type="number" step="0.5" min="1" placeholder="4.5" x-model="writingSizeMm" :disabled="!autoSize" />
+                </div>
                 <div class="form-group">
                   <label for="manualSizeScale" class="bx--label label-with-tooltip">
                     Manual Size Scale
                     <span class="tooltip-icon">
-                      <span class="tooltip-text">Manual text size when Auto Size is disabled. 1.0 = normal, 2.0 = double, 0.5 = half.</span>
+                      <span class="tooltip-text">Used when Auto Size is off: a multiple of the natural size. 1.0 = natural, 2.0 = double, 0.5 = half.</span>
                     </span>
                   </label>
                   <input id="manualSizeScale" class="bx--text-input" type="number" step="0.1" placeholder="1.0" x-model="manualSizeScale" :disabled="autoSize" />
diff --git a/webapp/utils/generation_utils.py b/webapp/utils/generation_utils.py
index c6f21fe..aaf48df 100644
--- a/webapp/utils/generation_utils.py
+++ b/webapp/utils/generation_utils.py
@@ -117,6 +117,8 @@ def _parse_int(val, default=None):
     empty_line_spacing = _parse_float(_get("empty_line_spacing"))
     auto_size = _parse_bool(_get("auto_size", "true"), True)
     manual_size_scale = _parse_float(_get("manual_size_scale"), 1.0)
+    # Target x-height in mm for natural sizing (None -> engine default ~4.5mm)
+    writing_size_mm = _parse_float(_get("writing_size_mm"))
 
     # Character overrides
     character_override_collection_id = _parse_int(_get("character_override_collection_id"))
@@ -165,6 +167,7 @@ def _parse_int(val, default=None):
         "empty_line_spacing": empty_line_spacing,
         "auto_size": auto_size,
         "manual_size_scale": manual_size_scale,
+        "writing_size_mm": writing_size_mm,
         "character_override_collection_id": character_override_collection_id,
         "wrap_char_px": wrap_char_px,
         "wrap_ratio": wrap_ratio,
@@ -282,6 +285,7 @@ def generate_handwriting_to_file(
             empty_line_spacing=params["empty_line_spacing"],
             auto_size=params["auto_size"],
             manual_size_scale=params["manual_size_scale"],
+            writing_size_mm=params["writing_size_mm"],
             character_override_collection_id=params["character_override_collection_id"],
             margin_jitter_frac=params["margin_jitter_frac"],
             margin_jitter_coherence=params["margin_jitter_coherence"],
@@ -336,6 +340,7 @@ def generate_handwriting_to_file(
             empty_line_spacing=params["empty_line_spacing"],
             auto_size=params["auto_size"],
             manual_size_scale=params["manual_size_scale"],
+            writing_size_mm=params["writing_size_mm"],
             character_override_collection_id=params["character_override_collection_id"],
             margin_jitter_frac=params["margin_jitter_frac"],
             margin_jitter_coherence=params["margin_jitter_coherence"],

From bde737b22a89109b862dcccc99f3d587617d16ba Mon Sep 17 00:00:00 2001
From: ariedotcodotnz <ariedotcodotnz@users.noreply.github.com>
Date: Wed, 10 Jun 2026 04:23:48 +0000
Subject: [PATCH 20/21] feat: add reflow text option for improved paragraph
 formatting and enhance writing size handling

---
 handwriting_synthesis/hand/Hand.py            | 336 +++++++++++++-----
 handwriting_synthesis/hand/_draw.py           |  84 ++++-
 .../hand/operations/__init__.py               |   3 +-
 .../hand/operations/chunking.py               |  60 +++-
 tests/test_operations.py                      |  47 ++-
 tests/test_sizing.py                          |  31 +-
 webapp/init_db.py                             |  41 ++-
 ...tebot.db.pre-email-fix.20260531_092148.bak | Bin 0 -> 122880 bytes
 webapp/static/js/modules/alpine-app.js        |   4 +
 webapp/templates/index.html                   |  20 +-
 webapp/utils/generation_utils.py              |  13 +-
 webapp/utils/text_utils.py                    |  28 ++
 12 files changed, 569 insertions(+), 98 deletions(-)
 create mode 100755 webapp/instance/writebot.db.pre-email-fix.20260531_092148.bak

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 69ca863..7f24a0f 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -19,6 +19,7 @@
     get_stroke_width,
     stitch_strokes,
     split_text_into_chunks,
+    balanced_line_breaks,
     sample_strokes,
 )
 
@@ -304,9 +305,182 @@ def _sample(self, lines, biases=None, styles=None, return_char_indices=False):
             return_char_indices=return_char_indices
         )
 
+    @staticmethod
+    def _measure_model_xheight(chunk_strokes):
+        """Median x-height of sampled chunks in raw model units.
+
+        Measured EXACTLY the way ``_draw`` measures it for sizing: deslant via
+        ``drawing.align``, then take the robust body band. Measuring post-align
+        (rather than on the raw slanted strokes) is what makes wrap-time
+        predictions match the rendered scale regardless of handwriting style.
+        Returns ``None`` when no chunk is measurable.
+        """
+        from handwriting_synthesis.hand._draw import _estimate_xheight
+        xheights = []
+        for stroke in chunk_strokes:
+            if stroke is None or len(stroke) < 8:
+                continue
+            coords = drawing.offsets_to_coords(stroke)
+            coords[:, :2] = drawing.align(coords[:, :2])
+            xheights.append(_estimate_xheight(coords))
+        return float(np.median(xheights)) if xheights else None
+
+    @staticmethod
+    def _estimate_stitched_xheight(sampled_lines, chunk_spacing, rotate_chunks, group_size=4):
+        """Measure stitched-line statistics: (x-height, width inflation factor).
+
+        ``_draw`` derives its render scale from the x-height of the final
+        STITCHED lines, which reads systematically taller than individual chunks
+        (baseline joins and residual drift widen the percentile band). Predicting
+        wrap widths from per-chunk measurements therefore left every rendered
+        line ~15-20% short of the right margin. Stitching chunks in groups of
+        ``group_size`` (about one final line's worth) with the same stitch
+        parameters reproduces the statistics ``_draw`` will actually see.
+
+        The width factor is how much wider a stitched line measures than the sum
+        of its chunk widths plus spacing (adaptive gaps and rotation correction
+        widen it, by a style-dependent amount). The wrap budget must be deflated
+        by it, otherwise the widest lines overrun the page and the global width
+        clamp trades the writing size away to fit them.
+
+        ``sampled_lines`` is the per-input-line list of ``None`` or
+        ``(chunks, strokes)`` built by ``write_chunked``; groups never span
+        input lines. Returns ``(None, 1.0)`` when nothing is measurable.
+        """
+        from handwriting_synthesis.hand._draw import _estimate_xheight
+        bands = []
+        width_factors = []
+        for entry in sampled_lines:
+            if not entry:
+                continue
+            strokes = [s for s in entry[1] if s is not None and len(s) >= 8]
+            for i in range(0, len(strokes), group_size):
+                group = strokes[i:i + group_size]
+                predicted_w = (sum(get_stroke_width(s) for s in group)
+                               + chunk_spacing * (len(group) - 1))
+                stitched = group[0]
+                for nxt in group[1:]:
+                    stitched = stitch_strokes(stitched, nxt, chunk_spacing,
+                                              rotate_to_match=rotate_chunks)
+                coords = drawing.offsets_to_coords(stitched)
+                coords[:, :2] = drawing.align(coords[:, :2])
+                bands.append(_estimate_xheight(coords))
+                actual_w = float(coords[:, 0].max() - coords[:, 0].min())
+                if len(group) > 1 and predicted_w > 1e-6:
+                    width_factors.append(actual_w / predicted_w)
+        if not bands:
+            return None, 1.0
+        factor = float(np.median(width_factors)) if width_factors else 1.0
+        return float(np.median(bands)), min(max(factor, 1.0), 1.3)
+
+    @staticmethod
+    def _content_box_px(page_size, units, margins, orientation):
+        """Resolve the page's content box (width_px, height_px) inside margins."""
+        from handwriting_synthesis.hand._draw import _resolve_page_size, _normalize_margins
+        width_px, height_px, _ = _resolve_page_size(page_size, units, 1, 60.0)
+        if orientation == 'landscape':
+            width_px, height_px = height_px, width_px
+        m_top, m_right, m_bottom, m_left = _normalize_margins(margins, units)
+        return (max(1.0, width_px - (m_left + m_right)),
+                max(1.0, height_px - (m_top + m_bottom)))
+
+    # Average rendered character advance as a fraction of the x-height. Used only
+    # to pre-estimate line capacity for chunk granularity, before any sampling.
+    _CHAR_ADVANCE_PER_XHEIGHT = 0.55
+
+    def _adaptive_chunk_chars(
+        self, input_lines, target_chars_per_chunk, page_size, units, margins,
+        orientation, x_stretch, writing_size_mm, auto_size,
+    ):
+        """Scale chunk granularity to the writing size, before any sampling.
+
+        At larger writing sizes a line holds fewer characters, so fixed ~25-char
+        chunks quantize badly: a line that holds 60 chars wraps after 2 chunks and
+        leaves up to a third of the width empty. This pre-estimates the page-fill
+        x-height from text statistics alone (the per-style stroke scale cancels
+        out of the solver when widths are expressed per character) and targets
+        ~4.5 chunks per line, clamped to [10, target_chars_per_chunk].
+        """
+        try:
+            from handwriting_synthesis.hand._draw import (
+                solve_fill_xheight_px, PX_PER_MM,
+                NATURAL_WRITING_MIN_FILL_MM, NATURAL_WRITING_MAX_FILL_MM,
+            )
+            if not auto_size:
+                return target_chars_per_chunk
+            n_chars = sum(len(ln.strip()) for ln in input_lines if ln.strip())
+            n_blank = sum(1 for ln in input_lines if not ln.strip())
+            if n_chars == 0:
+                return target_chars_per_chunk
+            content_w, content_h = self._content_box_px(page_size, units, margins, orientation)
+            xs = float(x_stretch) if x_stretch and float(x_stretch) > 0 else 1.0
+            if writing_size_mm is not None:
+                h_px = float(writing_size_mm) * PX_PER_MM
+            else:
+                # Express widths per character: raw width ~= chars * advance * xh,
+                # so passing model_xheight=1 cancels the style-dependent scale.
+                h_px = solve_fill_xheight_px(
+                    n_chars * self._CHAR_ADVANCE_PER_XHEIGHT, 1.0, n_blank,
+                    content_w, content_h, xs)
+                if not h_px:
+                    return target_chars_per_chunk
+                h_px = min(max(h_px, NATURAL_WRITING_MIN_FILL_MM * PX_PER_MM),
+                           NATURAL_WRITING_MAX_FILL_MM * PX_PER_MM)
+            # ~4.5 chunks per line: finer quanta let the line breaker land close
+            # to the margin (with ~3 chunks/line the achievable widths step by
+            # ~30%, which is what made wrapping look conservative).
+            chars_per_line = content_w / (self._CHAR_ADVANCE_PER_XHEIGHT * h_px * xs)
+            return int(min(target_chars_per_chunk, max(10, chars_per_line / 4.5)))
+        except Exception as exc:  # granularity heuristic must never block generation
+            print(f"Warning: adaptive chunk sizing failed, using default: {exc}")
+            return target_chars_per_chunk
+
+    def _auto_fill_writing_size(
+        self, chunk_strokes, n_blank_lines, page_size, units, margins,
+        orientation, x_stretch, model_xheight=None,
+    ):
+        """Pick a writing size (mm) so the text fills the page vertically.
+
+        A short letter at the base natural size only covers the top of the page;
+        a person writing the same letter by hand would simply write larger, and
+        someone with too much to say writes smaller to keep it on one page. This
+        solves for the x-height at which the wrapped text spans the content
+        height, clamped to [NATURAL_WRITING_MIN_FILL_MM, NATURAL_WRITING_MAX_FILL_MM].
+        Returns mm, or ``None`` to keep the default.
+        """
+        try:
+            from handwriting_synthesis.hand._draw import (
+                solve_fill_xheight_px, PX_PER_MM,
+                NATURAL_WRITING_MIN_FILL_MM, NATURAL_WRITING_MAX_FILL_MM,
+            )
+            if model_xheight is None:
+                model_xheight = self._measure_model_xheight(chunk_strokes)
+            if not model_xheight:
+                return None
+            content_w, content_h = self._content_box_px(page_size, units, margins, orientation)
+            total_raw_width = sum(
+                get_stroke_width(s) for s in chunk_strokes
+                if s is not None and len(s) > 0
+            )
+            xs = float(x_stretch) if x_stretch else 1.0
+            if xs <= 0:
+                xs = 1.0
+            h_px = solve_fill_xheight_px(
+                total_raw_width, model_xheight, n_blank_lines, content_w, content_h, xs)
+            if not h_px:
+                return None
+            # Short text -> grow toward the max; long text -> shrink toward the
+            # min so it fits one page with full-width lines (the wrap width and
+            # the rendered size must come from the SAME x-height).
+            return min(max(h_px / PX_PER_MM, NATURAL_WRITING_MIN_FILL_MM),
+                       NATURAL_WRITING_MAX_FILL_MM)
+        except Exception as exc:  # never block generation on a sizing heuristic
+            print(f"Warning: page-fill sizing failed, using default size: {exc}")
+            return None
+
     def _size_aware_max_line_width(
         self, chunk_strokes, max_line_width, page_size, units, margins,
-        orientation, writing_size_mm, x_stretch, auto_size,
+        orientation, writing_size_mm, x_stretch, auto_size, model_xheight=None,
     ):
         """Cap the wrap width so a full line still renders at the natural size.
 
@@ -325,26 +499,13 @@ def _size_aware_max_line_width(
         if not auto_size or not chunk_strokes:
             return max_line_width
         try:
-            from handwriting_synthesis.hand._draw import (
-                _resolve_page_size, _normalize_margins, _estimate_xheight,
-                PX_PER_MM, NATURAL_WRITING_SIZE_MM, WRAP_SIZE_CALIBRATION,
-            )
+            from handwriting_synthesis.hand._draw import PX_PER_MM, NATURAL_WRITING_SIZE_MM
 
-            width_px, height_px, _ = _resolve_page_size(page_size, units, 1, 60.0)
-            if orientation == 'landscape':
-                width_px, height_px = height_px, width_px
-            _, m_right, _, m_left = _normalize_margins(margins, units)
-            content_width_px = max(1.0, width_px - (m_left + m_right))
-
-            # Model x-height in raw units (robust median across the chunks).
-            xheights = []
-            for stroke in chunk_strokes:
-                if stroke is None or len(stroke) < 8:
-                    continue
-                xheights.append(_estimate_xheight(drawing.offsets_to_coords(stroke)))
-            if not xheights:
+            content_width_px, _ = self._content_box_px(page_size, units, margins, orientation)
+            if model_xheight is None:
+                model_xheight = self._measure_model_xheight(chunk_strokes)
+            if not model_xheight:
                 return max_line_width
-            model_xheight = float(np.median(xheights))
 
             target_mm = NATURAL_WRITING_SIZE_MM if writing_size_mm is None else float(writing_size_mm)
             target_xheight_px = max(1.0, target_mm * PX_PER_MM)
@@ -352,10 +513,12 @@ def _size_aware_max_line_width(
             if xs <= 0:
                 xs = 1.0
 
-            # Raw width whose rendered width == content width at the target size.
-            # WRAP_SIZE_CALIBRATION corrects for the per-chunk vs stitched-line
-            # x-height difference so the rendered size matches the requested one.
-            fit_raw = WRAP_SIZE_CALIBRATION * content_width_px * model_xheight / (target_xheight_px * xs)
+            # Raw width whose rendered width == content width at the target size:
+            #   rendered_width = raw_width * s_render * x_stretch, and
+            #   s_render = target_xheight_px / model_xheight  (height-driven sizing)
+            # => raw_width that fills the page = content_width * model_xheight
+            #    / (target_xheight_px * x_stretch). No empirical fudge factor needed.
+            fit_raw = content_width_px * model_xheight / (target_xheight_px * xs)
             return max(1.0, min(float(max_line_width), fit_raw))
         except Exception as exc:  # never block generation on a sizing heuristic
             print(f"Warning: size-aware wrap width failed, using max_line_width: {exc}")
@@ -446,6 +609,10 @@ def write_chunked(
         all_lines = []
         all_line_texts = []
 
+        # The writing size actually rendered. The no-override path may grow or
+        # shrink this (page-fill auto sizing) when no explicit size was requested.
+        effective_writing_size_mm = writing_size_mm
+
         # If we have overrides, use SPACE-PLACEHOLDER approach
         # KEY FIX: Chunk the ORIGINAL text first, THEN replace override chars in each chunk.
         # This preserves the position mapping between chunks and the original text.
@@ -630,32 +797,36 @@ def write_chunked(
             # No overrides - use original logic
             all_line_segment_data = None
 
+            # PHASE 1: chunk + sample every input line up front, so the writing
+            # size can be chosen from the WHOLE text before any line wrapping.
+            # Chunk granularity follows the estimated writing size: larger writing
+            # means fewer characters per line, which needs smaller chunks to wrap
+            # without leaving big quantization gaps at the right margin.
+            eff_target_chars = self._adaptive_chunk_chars(
+                input_lines, target_chars_per_chunk, page_size, units, margins,
+                orientation, x_stretch, writing_size_mm, auto_size,
+            )
+            valid_char_set = set(drawing.alphabet)
+            sampled_lines = []  # per input line: None (blank) or (chunks, strokes)
             for input_line in input_lines:
-                # Handle blank lines
                 if not input_line.strip():
-                    all_lines.append(np.empty((0, 3)))
-                    all_line_texts.append('')
+                    sampled_lines.append(None)
                     continue
 
                 # Split line into chunks with adaptive sizing
                 chunks = split_text_into_chunks(
                     input_line,
                     words_per_chunk=words_per_chunk,
-                    target_chars_per_chunk=target_chars_per_chunk,
+                    target_chars_per_chunk=eff_target_chars,
                     min_words=min_words_per_chunk,
                     max_words=max_words_per_chunk,
                     adaptive_chunking=adaptive_chunking,
                     adaptive_strategy=adaptive_strategy
                 )
-
                 if not chunks:
-                    all_lines.append(np.empty((0, 3)))
-                    all_line_texts.append('')
+                    sampled_lines.append(None)
                     continue
 
-                # Expand valid character set with overrides
-                valid_char_set = set(drawing.alphabet)
-
                 # Validate characters
                 for chunk_num, chunk in enumerate(chunks):
                     for char in chunk:
@@ -665,62 +836,65 @@ def write_chunked(
                                 f"Valid character set is {valid_char_set}"
                             )
 
-                # Generate strokes for all chunks
                 chunk_strokes = self._sample(
                     chunks,
                     biases=[biases] * len(chunks) if biases is not None else None,
                     styles=[styles] * len(chunks) if styles is not None else None
                 )
-
-                # Wrap to the page at the natural size: cap line width so a full
-                # line renders at the target x-height instead of being shrunk.
-                effective_max_line_width = self._size_aware_max_line_width(
-                    chunk_strokes, max_line_width, page_size, units, margins,
-                    orientation, writing_size_mm, x_stretch, auto_size,
+                sampled_lines.append((chunks, chunk_strokes))
+
+            # PHASE 2: choose the writing size, then the wrap width that fills the
+            # page at that size. When the user did not pick a size, fit it to the
+            # page: short texts grow (up to a natural cap) so they fill the page
+            # vertically instead of leaving the bottom half empty, and long texts
+            # shrink (down to a natural floor) so they still fit on one page.
+            all_strokes_flat = [s for entry in sampled_lines if entry for s in entry[1]]
+            n_blank_lines = sum(1 for entry in sampled_lines if entry is None)
+            # Measure the x-height on line-sized STITCHED groups -- the statistic
+            # _draw actually scales by -- so wrap-time predictions match the
+            # rendered size and lines reach the right margin.
+            stitched_xheight = self._estimate_stitched_xheight(
+                sampled_lines, chunk_spacing, rotate_chunks)
+            if auto_size and writing_size_mm is None and all_strokes_flat:
+                fitted_mm = self._auto_fill_writing_size(
+                    all_strokes_flat, n_blank_lines, page_size, units, margins,
+                    orientation, x_stretch, model_xheight=stitched_xheight,
                 )
+                if fitted_mm:
+                    effective_writing_size_mm = fitted_mm
 
-                # Stitch chunks into lines based on actual widths
-                current_line_stroke = np.empty((0, 3))
-                current_line_text = []
-                current_line_width = 0.0
-
-                for chunk_text, chunk_stroke in zip(chunks, chunk_strokes):
-                    chunk_width = get_stroke_width(chunk_stroke)
-
-                    # Check if chunk fits on current line
-                    potential_width = current_line_width
-                    if current_line_width > 0:
-                        potential_width += chunk_spacing + chunk_width
-                    else:
-                        potential_width = chunk_width
-
-                    if potential_width <= effective_max_line_width or current_line_width == 0:
-                        # Chunk fits on current line
-                        if current_line_width > 0:
-                            current_line_stroke = stitch_strokes(
-                                current_line_stroke,
-                                chunk_stroke,
-                                chunk_spacing,
-                                rotate_to_match=rotate_chunks
-                            )
-                            current_line_text.append(chunk_text)
-                        else:
-                            current_line_stroke = chunk_stroke
-                            current_line_text.append(chunk_text)
-                        current_line_width = potential_width
-                    else:
-                        # Start new line (width exceeded)
-                        all_lines.append(current_line_stroke)
-                        all_line_texts.append(' '.join(current_line_text))
+            effective_max_line_width = self._size_aware_max_line_width(
+                all_strokes_flat, max_line_width, page_size, units, margins,
+                orientation, effective_writing_size_mm, x_stretch, auto_size,
+                model_xheight=stitched_xheight,
+            )
+            # Allow a squeeze past the wrap limit: a writer fits one more word by
+            # tightening slightly rather than leaving a ragged gap. _draw condenses
+            # such lines by the same few percent per line (line_scale_x).
+            from handwriting_synthesis.hand._draw import LINE_SQUEEZE_TOLERANCE
+            squeeze_limit = effective_max_line_width * LINE_SQUEEZE_TOLERANCE
+
+            # PHASE 3: break chunks into balanced lines (raggedness spread evenly
+            # rather than greedy fill), then stitch each line's chunks together.
+            for entry in sampled_lines:
+                if entry is None:
+                    all_lines.append(np.empty((0, 3)))
+                    all_line_texts.append('')
+                    continue
+                chunks, chunk_strokes = entry
 
-                        current_line_stroke = chunk_stroke
-                        current_line_text = [chunk_text]
-                        current_line_width = chunk_width
+                widths = [get_stroke_width(s) for s in chunk_strokes]
+                breaks = balanced_line_breaks(
+                    widths, chunk_spacing, effective_max_line_width, squeeze_limit)
 
-                # Add last line from this input line
-                if len(current_line_stroke) > 0 or len(current_line_text) > 0:
-                    all_lines.append(current_line_stroke)
-                    all_line_texts.append(' '.join(current_line_text))
+                for start, end in breaks:
+                    line_stroke = chunk_strokes[start]
+                    for nxt in chunk_strokes[start + 1:end]:
+                        line_stroke = stitch_strokes(
+                            line_stroke, nxt, chunk_spacing,
+                            rotate_to_match=rotate_chunks)
+                    all_lines.append(line_stroke)
+                    all_line_texts.append(' '.join(chunks[start:end]))
 
         # Use the collected lines
         lines = all_lines
@@ -788,7 +962,7 @@ def _normalize_seq(value, desired_len, cast_fn=None, name='param'):
             empty_line_spacing=empty_line_spacing,
             auto_size=auto_size,
             manual_size_scale=manual_size_scale,
-            writing_size_mm=writing_size_mm,
+            writing_size_mm=effective_writing_size_mm,
             character_override_collection_id=character_override_collection_id,
             overrides_dict=overrides_dict,
             margin_jitter_frac=margin_jitter_frac,
diff --git a/handwriting_synthesis/hand/_draw.py b/handwriting_synthesis/hand/_draw.py
index 1408076..1c200f2 100644
--- a/handwriting_synthesis/hand/_draw.py
+++ b/handwriting_synthesis/hand/_draw.py
@@ -29,11 +29,64 @@
 # condensed per-line at render time instead of shrinking the whole document.
 WIDTH_OUTLIER_FACTOR = 2.0
 
-# Empirical correction for the size-aware wrap width (Hand.write_chunked). The wrap
-# width is derived from per-chunk x-heights, but _draw renders stitched, aligned,
-# de-noised lines whose measured x-height is a bit smaller, so without this lines
-# would render ~20% under the requested size. Calibrated against real model output.
-WRAP_SIZE_CALIBRATION = 0.82
+# Bounds for the auto page-fill writing size. Short texts grow (like a real
+# one-page letter written larger) up to the max; beyond it, blank space at the
+# bottom looks more natural than huge letters. Long texts shrink below the base
+# natural size down to the min so they fit one page with full-width lines --
+# crucially the WRAP width shrinks with the render size, otherwise _draw's
+# fallback shrink leaves every line short of the right margin.
+NATURAL_WRITING_MIN_FILL_MM = 2.5
+NATURAL_WRITING_MAX_FILL_MM = 7.0
+
+# Fraction of the content height the page-fill solver aims to use. Below 1.0 to
+# absorb what the closed-form estimate ignores: the first-line offset, integer
+# line rounding, and inter-chunk stitch gaps widening lines slightly.
+PAGE_FILL_FRACTION = 0.92
+
+# How far past the wrap budget a single line may go before it is condensed
+# horizontally (line_scale_x) at render time. A writer squeezes the last word in
+# rather than leaving a gap; an 8% horizontal tightening is visually invisible.
+# Used by the wrapper (line-break limit) and by the global width clamp, which
+# tolerates this much overhang on the widest line instead of shrinking ALL text.
+LINE_SQUEEZE_TOLERANCE = 1.08
+
+
+def solve_fill_xheight_px(
+    total_raw_width,
+    model_xheight,
+    n_blank_lines,
+    content_width_px,
+    content_height_px,
+    x_stretch=1.0,
+    spacing_per_xheight=LINE_SPACING_PER_XHEIGHT,
+    fill_frac=PAGE_FILL_FRACTION,
+):
+    """Solve for the x-height (px) at which wrapped text fills the page height.
+
+    At rendered x-height ``h`` the text scales by ``h / model_xheight``, so it
+    wraps into roughly ``n(h) = total_raw_width * h * x_stretch / (model_xheight
+    * content_width)`` lines, each advancing ``spacing_per_xheight * h``; blank
+    (paragraph-break) lines add the same advance without consuming text. Setting
+    the resulting height to ``fill_frac * content_height`` gives a quadratic in
+    ``h``::
+
+        a*h^2 + b*h - fill_frac*content_height = 0,
+        a = spacing_per_xheight * total_raw_width * x_stretch
+            / (model_xheight * content_width)
+        b = spacing_per_xheight * n_blank_lines
+
+    Returns the positive root, or ``None`` if the inputs are degenerate (any
+    non-positive width, x-height, content size, line spacing or fill
+    fraction). The caller is expected to clamp the result to a sensible range.
+    """
+    if total_raw_width <= 0 or model_xheight <= 0 or content_width_px <= 0 or content_height_px <= 0:
+        return None
+    a = spacing_per_xheight * total_raw_width * max(x_stretch, 1e-6) / (model_xheight * content_width_px)
+    b = spacing_per_xheight * max(0, n_blank_lines)
+    c = -fill_frac * content_height_px
+    if a <= 0 or c >= 0:  # a > 0 and c < 0 => disc > b*b: exactly one positive root
+        return None
+    return (-b + math.sqrt(b * b - 4.0 * a * c)) / (2.0 * a)
 
 
 def _estimate_xheight(ls):
@@ -776,7 +829,7 @@ def _draw(
                 raw_w = max(1e-6, ls[:, 0].max())
                 raw_h = max(1e-6, ls[:, 1].max())
                 raw_heights.append(raw_h)          # full extent (override matching)
-                xheights.append(_estimate_xheight(ls))  # robust body height (sizing)
+                xheights.append((_estimate_xheight(ls), raw_w))  # body height (sizing)
                 line_gen_raw_w += raw_w
 
                 preprocessed_segments.append({
@@ -809,7 +862,17 @@ def _draw(
     writing_mm = NATURAL_WRITING_SIZE_MM if writing_size_mm is None else float(writing_size_mm)
     target_xheight_px = max(1.0, writing_mm * PX_PER_MM)
 
-    typical_xheight = float(np.median(xheights)) if xheights else target_xheight_px
+    # Typical x-height from the LONG segments only: on short lines (a signature,
+    # a paragraph's last few words) ascenders/descenders are a large fraction of
+    # the points, which inflates the percentile band and would make all text
+    # render smaller and narrower than the wrap predicted.
+    if xheights:
+        max_seg_w = max(w for _, w in xheights)
+        long_bands = [h for h, w in xheights if w >= 0.5 * max_seg_w]
+        typical_xheight = float(np.median(long_bands if long_bands
+                                          else [h for h, _ in xheights]))
+    else:
+        typical_xheight = target_xheight_px
     size_scale = target_xheight_px / max(1e-6, typical_xheight)
 
     if auto_size:
@@ -817,13 +880,16 @@ def _draw(
         # Width clamp: fit every NORMAL line within the page, ignoring gross
         # outliers (a single unwrapped long line is condensed per-line at render
         # time via line_scale_x instead of shrinking every line -- which is what
-        # used to make the text tiny).
+        # used to make the text tiny). The clamp tolerates LINE_SQUEEZE_TOLERANCE
+        # of overhang on the widest line: that line is condensed individually,
+        # so one well-packed line doesn't scale the whole document down.
         if line_raw_widths:
             median_w = float(np.median(line_raw_widths))
             normal_widths = [w for w in line_raw_widths if w <= WIDTH_OUTLIER_FACTOR * median_w]
             width_ref = max(normal_widths) if normal_widths else median_w
             if width_ref > 1e-6:
-                s_global = min(s_global, content_width_px / (width_ref * x_stretch))
+                s_global = min(s_global, LINE_SQUEEZE_TOLERANCE * content_width_px
+                               / (width_ref * x_stretch))
     else:
         # manual_size_scale is now a multiple of the natural size (1.0 == natural).
         s_global = float(manual_size_scale) * size_scale
diff --git a/handwriting_synthesis/hand/operations/__init__.py b/handwriting_synthesis/hand/operations/__init__.py
index d5a826d..c693231 100644
--- a/handwriting_synthesis/hand/operations/__init__.py
+++ b/handwriting_synthesis/hand/operations/__init__.py
@@ -14,7 +14,7 @@
     calculate_adaptive_spacing,
     stitch_strokes,
 )
-from .chunking import split_text_into_chunks
+from .chunking import split_text_into_chunks, balanced_line_breaks
 from .sampling import sample_strokes
 
 __all__ = [
@@ -26,5 +26,6 @@
     'calculate_adaptive_spacing',
     'stitch_strokes',
     'split_text_into_chunks',
+    'balanced_line_breaks',
     'sample_strokes',
 ]
diff --git a/handwriting_synthesis/hand/operations/chunking.py b/handwriting_synthesis/hand/operations/chunking.py
index ba1427e..5c668aa 100644
--- a/handwriting_synthesis/hand/operations/chunking.py
+++ b/handwriting_synthesis/hand/operations/chunking.py
@@ -1,6 +1,64 @@
 """Text chunking logic for improved handwriting generation."""
 
-from typing import List, Optional
+from typing import List, Optional, Tuple
+
+
+def balanced_line_breaks(
+    widths: List[float],
+    spacing: float,
+    target: float,
+    limit: float,
+) -> List[Tuple[int, int]]:
+    """Choose line breaks over measured chunk widths, minimising raggedness.
+
+    Greedy filling makes line lengths erratic (one line packed past the budget,
+    the next stopping at 60%), which reads as a jagged right margin. This is the
+    classic dynamic-programming line-breaking approach applied to chunk widths:
+    every line except the last pays for its deviation from the target, so slack
+    is spread evenly across lines instead of accumulating in one. The penalty is
+    asymmetric: undershoot pays the full quadratic, overshoot (up to ``limit``)
+    only a quarter -- a slightly over-full line is condensed a few percent at
+    render time, which looks like natural cramming, whereas an under-full line
+    leaves a visible gap at the margin.
+
+    Args:
+        widths: Measured raw width of each chunk, in order.
+        spacing: Horizontal gap added between adjacent chunks on a line.
+        target: Ideal line width (the wrap budget).
+        limit: Hard maximum line width (target plus any squeeze allowance). A
+            single chunk wider than the limit still gets a line of its own.
+
+    Returns:
+        List of (start, end) index pairs, one per line, covering all chunks.
+    """
+    n = len(widths)
+    if n == 0:
+        return []
+    inf = float('inf')
+    best = [0.0] + [inf] * n
+    back = [0] * (n + 1)
+    for j in range(1, n + 1):
+        w = 0.0
+        for i in range(j - 1, -1, -1):
+            w = widths[i] + (spacing + w if i < j - 1 else 0.0)  # gap per pair, even zero-width
+            if w > limit and i < j - 1:
+                break  # adding earlier chunks only widens the line further
+            if j == n:
+                penalty = 0.0           # the final line may be any length
+            elif w <= target:
+                penalty = (target - w) ** 2
+            else:
+                penalty = 0.25 * (w - target) ** 2  # mild: overshoot is condensed
+            if best[i] + penalty < best[j]:
+                best[j] = best[i] + penalty
+                back[j] = i
+    lines = []
+    j = n
+    while j > 0:
+        i = back[j]
+        lines.append((i, j))
+        j = i
+    return lines[::-1]
 
 
 # Tokens that mark the end of a sentence -- strong, high-priority break points.
diff --git a/tests/test_operations.py b/tests/test_operations.py
index 25d9d2a..18cb841 100644
--- a/tests/test_operations.py
+++ b/tests/test_operations.py
@@ -13,7 +13,10 @@
 # Make the project importable when run directly (python tests/test_operations.py).
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
 
-from handwriting_synthesis.hand.operations.chunking import split_text_into_chunks
+from handwriting_synthesis.hand.operations.chunking import (
+    split_text_into_chunks,
+    balanced_line_breaks,
+)
 
 
 # Tokens longer than this are hard-split by the chunker (see chunking.py).
@@ -94,6 +97,48 @@ def test_whitespace_and_empty_inputs():
     assert lead_trail[-1].endswith("   "), lead_trail
 
 
+def _line_widths(widths, spacing, breaks):
+    """Width of each (start, end) line: chunk widths plus the gaps between them."""
+    return [
+        sum(widths[start:end]) + spacing * (end - start - 1)
+        for start, end in breaks
+    ]
+
+
+def test_balanced_breaks_cover_all_chunks_in_order():
+    spacing, limit = 8.0, 260.0
+    widths = [90.0, 110.0, 100.0, 95.0, 105.0, 80.0, 120.0]
+    breaks = balanced_line_breaks(widths, spacing, target=250.0, limit=limit)
+    covered = [idx for lo, hi in breaks for idx in range(lo, hi)]
+    assert covered == list(range(len(widths))), breaks  # contiguous, in order
+    assert max(_line_widths(widths, spacing, breaks)) <= limit, breaks  # no chunk is oversized
+
+
+def test_balanced_breaks_spread_slack():
+    """DP must not strand a short line mid-paragraph when a better split exists.
+
+    Greedy filling on these widths gives lines of 210, 40 and 200; balanced
+    breaking moves one chunk down a line to give 140, 110 and 200 instead.
+    """
+    widths = [70.0, 70.0, 70.0, 40.0, 200.0]
+    breaks = balanced_line_breaks(widths, 0.0, target=200.0, limit=216.0)
+    line_w = _line_widths(widths, 0.0, breaks)
+    assert breaks == [(0, 2), (2, 4), (4, 5)], breaks
+    # every non-final line must beat greedy's stranded 40-wide line
+    assert min(line_w[:-1]) >= 110.0, line_w
+
+
+def test_balanced_breaks_oversized_chunk_gets_own_line():
+    oversized_middle = [50.0, 500.0, 50.0]  # middle chunk is wider than the hard limit
+    result = balanced_line_breaks(oversized_middle, 5.0, target=200.0, limit=210.0)
+    assert (1, 2) in result, result  # the huge chunk stands alone
+
+
+def test_balanced_breaks_empty_and_single():
+    results = [balanced_line_breaks(w, 5.0, 100.0, 105.0) for w in ([], [42.0])]
+    assert results == [[], [(0, 1)]], results
+
+
 def test_progress_guaranteed_with_degenerate_min_words():
     """min_words=0 must not cause an infinite loop."""
     chunks = split_text_into_chunks(
diff --git a/tests/test_sizing.py b/tests/test_sizing.py
index 346307b..352f473 100644
--- a/tests/test_sizing.py
+++ b/tests/test_sizing.py
@@ -21,7 +21,10 @@
 
 from handwriting_synthesis import drawing
 from handwriting_synthesis.hand import _draw as draw_mod
-from handwriting_synthesis.hand._draw import _draw, PX_PER_MM, NATURAL_WRITING_SIZE_MM
+from handwriting_synthesis.hand._draw import (
+    _draw, PX_PER_MM, NATURAL_WRITING_SIZE_MM,
+    solve_fill_xheight_px, LINE_SPACING_PER_XHEIGHT,
+)
 
 _COORD = re.compile(r'[ML]\s*([-\d.]+)[\s,]+([-\d.]+)')
 
@@ -149,6 +152,32 @@ def test_shrinks_to_fit_one_page():
     assert _rendered_xheight_mm(path) < 5.0                              # was scaled down
 
 
+def test_fill_solver_fills_target_height():
+    """Feeding the solution back into the line/height model hits the fill target."""
+    raw_width, model_xh, box_w, box_h, frac = 5000.0, 20.0, 600.0, 900.0, 0.92
+    solved = solve_fill_xheight_px(raw_width, model_xh, 0, box_w, box_h, fill_frac=frac)
+    assert solved and solved > 0
+    line_count = raw_width * solved / (model_xh * box_w)  # no blank lines, x_stretch 1
+    filled = line_count * LINE_SPACING_PER_XHEIGHT * solved
+    assert abs(filled - frac * box_h) < 1e-6, (filled, frac * box_h)
+
+
+def test_fill_solver_monotonic():
+    """Adding text, or adding blank lines, each lowers the solved x-height."""
+    def solve(raw_width, blanks):
+        return solve_fill_xheight_px(
+            raw_width, 20.0, blanks, content_width_px=600.0, content_height_px=900.0)
+    baseline = solve(2000.0, 0)
+    assert solve(20000.0, 0) < baseline, (solve(20000.0, 0), baseline)
+    assert solve(2000.0, 5) < baseline, (solve(2000.0, 5), baseline)
+
+
+def test_fill_solver_degenerate_inputs():
+    degenerate = [(0.0, 20.0, 0, 600.0, 900.0), (100.0, 0.0, 0, 600.0, 900.0),
+                  (100.0, 20.0, 0, 0.0, 900.0)]  # zero text width / x-height / content width
+    assert all(solve_fill_xheight_px(*case) is None for case in degenerate)
+
+
 def test_manual_scale_is_multiple_of_natural():
     """auto_size=False: manual_size_scale=2 renders ~2x the natural size."""
     segs = [[_segment("hello world", line_idx=i)] for i in range(3)]
diff --git a/webapp/init_db.py b/webapp/init_db.py
index e230cf0..f85a70a 100644
--- a/webapp/init_db.py
+++ b/webapp/init_db.py
@@ -155,6 +155,42 @@ def init_database():
                 _reconcile_missing_columns()
 
 
+# Standard page sizes the UI expects (names must match the engine's PAPER_SIZES_MM
+# and the frontend's predefined-size list so they resolve correctly).
+DEFAULT_PAGE_SIZES = [
+    ('A4', 210.0, 297.0),
+    ('A5', 148.0, 210.0),
+    ('Letter', 215.9, 279.4),
+    ('Legal', 215.9, 355.6),
+]
+
+
+def seed_default_page_sizes():
+    """Insert any standard system page sizes that the database lacks.
+
+    Without them the UI's page-size dropdown is empty, which forces every
+    generation onto the A4 fallback. Names already stored are skipped, so the
+    call is idempotent and safe to repeat on every init.
+    """
+    from models import PageSizePreset
+    with app.app_context():
+        known = {row[0] for row in db.session.query(PageSizePreset.name).all()}
+        missing = [spec for spec in DEFAULT_PAGE_SIZES if spec[0] not in known]
+        if not missing:
+            print("Default page sizes already present.")
+            return
+        created = [spec[0] for spec in missing]
+        for name, width, height in missing:
+            preset = PageSizePreset(
+                name=name, width=width, height=height, unit='mm',
+                is_active=True, is_default=True, created_by=None,
+            )
+            db.session.add(preset)
+        # One commit for the whole batch of new presets.
+        db.session.commit()
+        print(f"Seeded default page sizes: {', '.join(created)}")
+
+
 def create_admin_user():
     """
     Create a default admin user interactively.
@@ -290,8 +326,11 @@ def main():
     # Initialize database
     init_database()
 
+    # Seed system defaults the UI depends on (page-size dropdown).
+    seed_default_page_sizes()
+
     if args.auto:
-        # Automatic mode - just run migrations and exit
+        # Automatic mode - schema + system defaults, then exit
         print("Database initialization completed (auto mode)")
         return
 
diff --git a/webapp/instance/writebot.db.pre-email-fix.20260531_092148.bak b/webapp/instance/writebot.db.pre-email-fix.20260531_092148.bak
new file mode 100755
index 0000000000000000000000000000000000000000..83d292285fd07e235a1334659508d24c4f5924dc
GIT binary patch
literal 122880
zcmeI)&2!u6eFt!YdZFIw#Mv;;#%n0EnToS(82|}V)K13X+FVDqEq7%(iDwTe2t1Uq
zrU)(o%97ipL%X{(eL1wJPLoTU>9MCC+EY%+KafMFr(W6~(o-_&<kG$L_khGJU+iSG
zGg{4;yQanG0iKu7^WeerJfQsW{d<<j#Fp(G>7H21T*?SS=3Ane$z(R@&l~h7cwM0%
zW`hs(w-Ebz(vKUN&1WCY@q_c9@?%~b{59!tghK!V5P$##AOHafKmY;|fB*y_aP|UU
zT3Mw;vui>opQksxAOHafKmY;|fB*y_009U<;Nl8AU(T*&)}Oz>URbQxg<N6LW3H#^
z=8@HL4d=A$ZI_g4WovtDTYl4f?5O64^~cJw{otgb{NbC1(qfgCWRy#0S*_G6s;sNc
zs<c(!Dye0)R;d`}ie8p#hQv&xTxO~+waSK3QCd<>YUzs9+|s48E;FT4Q4F;un=QRs
ztCpo^#W3`$$`q+q-BJ|2#nf`OV#-WzRhS~1<(k=QmMZ&HR<X{@>K9m%l*)BUy<U<<
zN!ga9?NW79t*8~LHWH(jHcMq?OD!$l{N<o=n{C~A((edsnQPDAzxLW<J;OCjc4TWk
zmpP*t)?HIIy}QRx-gDTyA3ARuEz?k#L^Z5cS=pc(OQs^1Wm8gXW|JgTr7G(!St^+n
zS*yzQiEZhuYErG6HN7cUTgq06{;nw1vRPA0N^?s!q%BsH6p0Y4hFsN4hGZCuBA28}
zd8<`cs_Is0OExOCt=m*o@evi37k8w>L{Y9*B_&niC8~Tj|KFMX|I!;?5P$##AOHaf
zKmY;|fB*y_009VmZUxq6)`e(SC%ZCpE!x)L&;J(||2mWZgSEZYzgqd%<zFptEa?=0
z7X%;x0SG_<0uX?}XD-m+TUg2n*LJQw&($;8u71F@V~ahl8wPV-W{O9;)e-e>w{01^
zXW1Q*A0jK&O^N=K@7vF;c3Z!pZc5_Dcdd?TKXt|3eNm>^cSH)Pl;06gD&?YBrzF^S
zS@W*t-B9VdP^ltr-2L|c{d;eVZR-gW?=a(uT@-H}I`$E}A(v}@29o$tZ|RN|oYF84
zqPY0urt)@4+Agb`YN=e2Oa0q9s?^mRgGwE+4s&#m-5LIEiYn#VVeWaXhEmy<Yny6K
zmMU`p_Wb9z8tS$pZ&qcwTvhwG=cZOeKF6(5+E%EWOSMvEtG_o()wlpxic;E^>4H_N
zwTjZeJ@ff&8~;(eTHCBvYcgH%d)bT7HiJshw*+c6wI=nyBT%IZj|Zbz+di;5{7JhQ
zEvrQeJfK0EhG@6vkJznVyM228#<j3wRIQrKpZ_o9f1b&In*VM7SNZ?R|5yG$^8Zc&
zctHRH5P$##AOHafKmY;|fB*y_aDD~mW&|O-;J@em_j&()&VQfv-)H>y?CgxNNLw|0
z{lA#W|7ZTE`G3lPoNwjt<Toe)F9<*Y0uX=z1Rwwb2tWV=5P-l%6nJB%E=>Lw)52(U
z{M$jfG10Ma-OP`Pj(wYDZftb;y^z^-bn^QaGvlL!Z!To7&eRv<6-z$<=imPq@;}Yc
z|9C+F0uX=z1Rwwb2tWV=5P$##AaH&K-r>KRur4eJ*<3vvt@LMCmVWlFn-72VYbtpZ
zDoW+;;m$?1wpFY0=l??f*BSaBF9<*Y0uX=z1Rwwb2tWV=5P$##E{H&WR#;z*?Gg;u
z|G&xPe{(@9h-yFp0uX=z1Rwwb2tWV=5P$##AV31EGwZS62N<mXKc#>F4>%Bj00bZa
z0SG_<0uX=z1Rwwb2wXq`|KI=PzyF`PbOCFJDnS4O5P$##AOHafKmY;|fB*y_5DM_;
z|5*Qr9QXkO5P$##AOHafKmY;|fB*y_a3KZo{Qp8WD=Gy62tWV=5P$##AOHafKmY;|
zNC@Ehe}V%aAOHafKmY;|fB*y_009U<00I|M0MGv~WV51D5P$##AOHafKmY;|fB*y_
z0D*)6p8qE}@Bso4fB*y_009U<00Izz00ba#AqBAhzmUy}N<jbu5P$##AOHafKmY;|
zfB*y%0$BeiIPd`i5P$##AOHafKmY;|fB*y_a3KZo`~MfRSy3qnKmY;|fB*y_009U<
z00IzzKtf=3`G=X6xyzZgKVF+z`Nx%SEdO}%@0K4giVOcR|8EPo<}T0vP4=I&tHQI)
zFX_O~=+)1EAy@cjN$9`x8D)TEApijgyqds^^_5)V#tq@)k37BEW?lV&X|DB*X<di8
z%yUNqR&VV!>iZ3Gzy9vMhBy)}ZY(WYrns}a-?-D*6L%l%i@T5R-FsVHTI}dYO#DuL
z@7A~LdmFMO6%(;fEz>&`-@EsqPG`i<I%L+tp*JR?*Ri}<a>>*zSJMs8I%eX#4<6iW
z)OQnUQmn~Zdavz`jW!&nd(70Dr$Ie<@q<v^6K@m1{?7eIBGF#g^v{aL^V9xlZ|8n}
z@3DBd@mSoj%pwna@4;SU=guw<+X#|vo)*Qu#(Ryu#_p}gL$T*F$MsKGD!y1;$`y8Y
zgpXJKHZl%%hbqS$%|2$1W0_1d>~@<`eeI4rJ$5<Kdee`6<)&0qO=&XMaI7vbj=10W
z-hNy!GELwG3NgGy`~=+T%{-3&(eBRsj~e0z?~K1sTYRuQU$A$CjMXvOiThF8@|dRg
zJlp@QO$XKF>G3N+Sj-h}T^IUS{M4)yt>^mJqo;e8>sf|N0o3f4nGRnXHtWvr?Z)@S
zDJO~#cByM66BRdxsTO++G&Q08S10DryLkHi#g}Hz&l;aIk14*mIiD+Bzb<?t`~5qe
ze)@AR(TCHqQ+n;Nn!!b#=wN;k`E{KLb&n6U1J+><H2_^(!I`{6H+9cAoD}KVp5E5l
zR)@L4AuJpe`PqX5!tOBjsIFrh%yq5Kf#z8>g!*G7Ph`SHF|OO+da(O&f3Hr%vN-5N
zyuMn`5QFPsX!Ew{w+K%?K1c-}WYP$AGdEv&czx1Uk!~VQPKPYe&5HlY^fPn0LgOu=
ze>LiY)Z_iJ-102q<OHs#A9ejQMG^C(y>P;bgPxd(6gR?Ti_d0rg*uhswMiuyVx;n)
zds+U&)5hjLj4M9ZW^#qM-V*-2H@p%OrA*~Go4O7X$Nrv|!O-(m^G>_b&H7aac&n@F
zrb&|qH@qpAReyvX6q_!i=yt8BgQ!q_{4JH8c#f`7kJQ8a$hgk^yY$o#vh#%-1>w))
zjXM}t`5l25)two-4lj+^m`aiU)#J}({E^hZN%OP(`^k%|LatCK2%lUDo6q%=%#Jh_
z-P<Qll!Ej^$gR{3wyV4DQ`<4ML)|?LkK(9_(W}*Kw>7`4;jD{Vl3&}7-HuvUNu@sh
zruwowbz65mt!*Dzov=-Lj>Gwu=6a`;^>BI^k7!!D8x3vI;g0>ppWoO{RM9~tQpXG?
zkg=50U1h1rpa17GKgs03zxwwp|F?3m{EM74|1a~aGmo>s&Hf;JS@^N=4T?UWuh#3i
zLUmo}3qjW%+<`u{o8hz|3V9<v8bwjT9D)x-QD`&?_>-^F2u%5eBXD$(X0SGMXmHTo
zCo~I*BCbx!E_qHm*JNzbzCtq)$~>HbM0sdY&x-=THZ9}fG@j;^sj5aP6|Jx23N^}b
zZj5Tzv24fkPNVEzpORg0bUL#jyx1$yjfJubZ!A%|8co}o*=y6HL-(TKuTIG>envXW
zcwDjh<@tiTE_{DX1?fKO_1q}GE7S7xj~&U$4=sKyzm_YkuM2;9-5>a(6r-QMGE950
zZj8cTG7eHV3r35?*ydSYd<aVqeS@0CM~uYTL7R+Cn#MnhKjw4Raf`<|ZeoawdQ}*l
z$dt<D@MUFejl%~(!#?V^*_en4(>trv?Ri?uYSS8BrzLYRwyU(F=IPlzZ~6g_{gf7r
zJ-Qvo21~bR@JZ%q1g$V_XHqat#rR?(ygSFHs&oqogOe9|SYo~hz)Rev_3RM|(1JHu
zF0v!tqI;L^SuM-(C*PXOClJF1;b*0XcOLe;%mn&h|5C2-0j+j+f-A#g)DnDNJly*r
z-{fuxb;k5Dh4dwpK44rT{3=QxCvJ?%t@x*x2P?AICzg@l-_vP*d043Q3?_ZtU~(|F
zF#dezFTVaoe_{5wPyNd}eR}%y<-{ePj-7I$kIYdL*Uh-?gfUh@Z%&*`<2Q91zXOHc
zPD#yoj&z4+E1G9_qxr`$%!&4%c=sBHHf_(dk46q|v(|8*Dt7GDT`Imn)Z!al{KQzG
zmOh)uQz!LK{k5e=yK@FD3!{>FG-KBEwsp`sN=&y?H9lzDO<JG2hE8K>xSZtM82;X0
z{3<)rJ3W3V-x5q6-_(sK2M!G+Cf|P=ouBa;9g8mMF0D`ey7^PUDBSg;Dc(q^KdFmC
z52-z$+0>v^BF|y80*{JsvW{)J;h4>-!u7Ql+g;=lPaVCh`HTAKd_Sz`S#9gtpudDL
zG#{s(onGe&4c^giKwL>Tb=vMa9v;S<BsNfmkyK<FT@Fseo;aM1^Qk=DbI8ZfCFsb5
z8c?>Q?R=!43@lUidbO#0v<K&B5lxfMa$j@I6`I)KOvev4J9&ZPgLrUHow7R`Jy08r
zk>&~Sj>GNHXh$yU3&F0b`^DtgJ>f(?SdTeb+=%63nQuqO6#JJh<qG%d5#RM-)E;yG
z)RKK6Su(28$eu6Vs)Mk}B@B)z_P>_T6&_KMzB#o>k(*_P$%os_?xj~MRy>i(MT;L*
ze4(%93Z;_piRIsD#$-JvD4)39j5%P+9VgLt!3CGNbA;)ISF4;{UGu%caBCuQ>aoS1
zHtiGbBpO_TpwrQy*d;tRmH7Q3aX^!XSARgYTP=E26QvwnG_i?h;=~CP%a={45@v~O
zIkXD9O6(F0o?`SftGU9>8$$nD&;>_^0DnF*61p~Q=k#o5<iKz`GZHi0@5k@|pU<rg
zlnMe6fB*y_009U<00Izz00ba#mI7G+pQR?!f&c^{009U<00Izz00bZa0SKH=0sQ-a
z=d)c<DhNOT0uX=z1Rwwb2tWV=5P-m03gG$wS!yCJ2tWV=5P$##AOHafKmY;|fWY|_
z!217uwhKxH0SG_<0uX=z1Rwwb2tWV=5I9Q#tpCqa6KO#J0uX=z1Rwwb2tWV=5P$##
z&Zhv@|L3z^P$~#O00Izz00bZa0SG_<0uX?}SqfnNf0mj^3jz>;00bZa0SG_<0uX=z
m1R!ue1+e}<pY4KDK>z{}fB*y_009U<00Izz00hob;Qs*%o>Kh)

literal 0
HcmV?d00001

diff --git a/webapp/static/js/modules/alpine-app.js b/webapp/static/js/modules/alpine-app.js
index 1bb8611..29f81cb 100644
--- a/webapp/static/js/modules/alpine-app.js
+++ b/webapp/static/js/modules/alpine-app.js
@@ -67,6 +67,7 @@ document.addEventListener('alpine:init', () => {
 
     // Chunked generation
     useChunked: true,
+    reflowText: true,  // reflow soft-wrapped input to fill the width (keeps blank-line paragraph breaks)
     adaptiveChunking: true,
     adaptiveStrategy: 'balanced',
     wordsPerChunk: '',
@@ -285,6 +286,7 @@ document.addEventListener('alpine:init', () => {
         manual_size_scale: (!this.autoSize && this.manualSizeScale) ? Number(this.manualSizeScale) : undefined,
         writing_size_mm: this.writingSizeMm ? Number(this.writingSizeMm) : undefined,
         use_chunked: this.useChunked,
+        reflow: this.reflowText,
         adaptive_chunking: this.adaptiveChunking,
         adaptive_strategy: this.adaptiveStrategy || undefined,
         words_per_chunk: this.wordsPerChunk ? Number(this.wordsPerChunk) : undefined,
@@ -665,6 +667,7 @@ document.addEventListener('alpine:init', () => {
       formData.append('wrap_ratio', this.wrapRatio || '');
       formData.append('wrap_utilization', this.wrapUtil || '');
       formData.append('use_chunked', this.useChunked ? 'true' : 'false');
+      formData.append('reflow', this.reflowText ? 'true' : 'false');
       formData.append('adaptive_chunking', this.adaptiveChunking ? 'true' : 'false');
       formData.append('adaptive_strategy', this.adaptiveStrategy || '');
       formData.append('words_per_chunk', this.wordsPerChunk || '');
@@ -823,6 +826,7 @@ document.addEventListener('alpine:init', () => {
           wrap_ratio: this.wrapRatio || null,
           wrap_utilization: this.wrapUtil || null,
           use_chunked: this.useChunked,
+          reflow: this.reflowText,
           adaptive_chunking: this.adaptiveChunking,
           adaptive_strategy: this.adaptiveStrategy || null,
           words_per_chunk: this.wordsPerChunk || null,
diff --git a/webapp/templates/index.html b/webapp/templates/index.html
index 2590284..cacabfd 100644
--- a/webapp/templates/index.html
+++ b/webapp/templates/index.html
@@ -383,10 +383,10 @@ <h3 class="card-title">Page Settings</h3>
                   <label for="writingSizeMm" class="bx--label label-with-tooltip">
                     Writing Size (mm)
                     <span class="tooltip-icon">
-                      <span class="tooltip-text">Target x-height (height of lowercase letters like a, e, o) in millimetres. ~4.5mm is normal pen handwriting; increase for larger writing. Used when Auto Size is on.</span>
+                      <span class="tooltip-text">Target x-height (height of lowercase letters like a, e, o) in millimetres. Leave blank for Auto: picks 2.5–7mm so the text naturally fills one page (short notes are written larger, long letters smaller). Set a value to lock the size exactly; ~4.5mm is normal pen handwriting.</span>
                     </span>
                   </label>
-                  <input id="writingSizeMm" class="bx--text-input" type="number" step="0.5" min="1" placeholder="4.5" x-model="writingSizeMm" :disabled="!autoSize" />
+                  <input id="writingSizeMm" class="bx--text-input" type="number" step="0.5" min="1" placeholder="Auto" x-model="writingSizeMm" :disabled="!autoSize" />
                 </div>
                 <div class="form-group">
                   <label for="manualSizeScale" class="bx--label label-with-tooltip">
@@ -457,6 +457,22 @@ <h3 class="card-title">Page Settings</h3>
                       Generate text in small chunks for better quality and longer lines
                     </small>
                   </div>
+                  <div class="form-group">
+                    <div class="label-with-tooltip label-tooltip-mb">
+                      <label class="bx--checkbox-wrapper m-0">
+                        <input id="reflowText" type="checkbox" class="bx--checkbox" x-model="reflowText" />
+                        <label for="reflowText" class="bx--checkbox-label">
+                          <span class="bx--checkbox-label-text">Reflow text to fill width (Recommended)</span>
+                        </label>
+                      </label>
+                      <span class="tooltip-icon">
+                        <span class="tooltip-text">Joins soft-wrapped lines so paragraphs fill the page width, keeping blank-line paragraph breaks. Uncheck to keep your exact line breaks.</span>
+                      </span>
+                    </div>
+                    <small class="helper-text">
+                      Fills the page width; uncheck to keep your exact line breaks
+                    </small>
+                  </div>
                   <div class="form-group">
                     <div class="label-with-tooltip label-tooltip-mb">
                       <label class="bx--checkbox-wrapper m-0">
diff --git a/webapp/utils/generation_utils.py b/webapp/utils/generation_utils.py
index aaf48df..f993885 100644
--- a/webapp/utils/generation_utils.py
+++ b/webapp/utils/generation_utils.py
@@ -10,6 +10,7 @@
 from webapp.utils.text_utils import (
     normalize_text_for_model,
     wrap_by_canvas,
+    reflow_paragraphs,
     parse_optional_list as _parse_optional_list,
     parse_margins as _parse_margins,
     map_sequence_to_wrapped as _map_sequence_to_wrapped,
@@ -119,6 +120,9 @@ def _parse_int(val, default=None):
     manual_size_scale = _parse_float(_get("manual_size_scale"), 1.0)
     # Target x-height in mm for natural sizing (None -> engine default ~4.5mm)
     writing_size_mm = _parse_float(_get("writing_size_mm"))
+    # Reflow soft-wrapped lines to fill the page width (preserving blank-line
+    # paragraph breaks). Default on; turn off to keep the input's exact line breaks.
+    reflow = _parse_bool(_get("reflow", "true"), True)
 
     # Character overrides
     character_override_collection_id = _parse_int(_get("character_override_collection_id"))
@@ -168,6 +172,7 @@ def _parse_int(val, default=None):
         "auto_size": auto_size,
         "manual_size_scale": manual_size_scale,
         "writing_size_mm": writing_size_mm,
+        "reflow": reflow,
         "character_override_collection_id": character_override_collection_id,
         "wrap_char_px": wrap_char_px,
         "wrap_ratio": wrap_ratio,
@@ -205,7 +210,13 @@ def generate_handwriting_to_file(
     """
     # Parse lines from text or lines parameter
     if params["text"] is not None:
-        lines_in = params["text"].splitlines() if isinstance(params["text"], str) else params["text"]
+        text_str = params["text"] if isinstance(params["text"], str) else "\n".join(str(x) for x in params["text"])
+        # Reflow soft-wrapped input so paragraphs fill the page width. Without this,
+        # a letter pasted with a hard line break every few words renders as a column
+        # of half-empty lines (the wrapper can't widen lines that arrive pre-broken).
+        if params.get("reflow", True):
+            text_str = reflow_paragraphs(text_str)
+        lines_in = text_str.splitlines()
     elif params["lines"] is not None:
         lines_in = params["lines"] if isinstance(params["lines"], list) else [params["lines"]]
     else:
diff --git a/webapp/utils/text_utils.py b/webapp/utils/text_utils.py
index 7ec5b52..c96da52 100644
--- a/webapp/utils/text_utils.py
+++ b/webapp/utils/text_utils.py
@@ -80,6 +80,34 @@ def normalize_text_for_model(s: str, override_chars: Optional[set] = None) -> st
     return out
 
 
+def reflow_paragraphs(text: str) -> str:
+    """Join each paragraph's soft-wrapped lines into one long line.
+
+    CRLF and bare CR line endings are normalised to newlines first. The text
+    is then split into paragraphs on blank lines (empty, or holding only
+    spaces/tabs); within a paragraph every line is stripped and the non-empty
+    pieces are joined with single spaces, so the generator can re-wrap it to
+    the page width instead of rendering a column of half-empty lines. Runs of
+    blank lines collapse to one; leading/trailing blank lines are dropped.
+
+    Args:
+        text: Raw input text (may contain hard line breaks).
+
+    Returns:
+        ``text`` unchanged when it is empty/falsy; otherwise the paragraphs,
+        one per line, separated by a single blank line each.
+    """
+    if not text:
+        return text
+    text = text.replace('\r\n', '\n').replace('\r', '\n')
+    paragraphs = re.split(r'\n[ \t]*\n', text)  # blank = empty or only spaces/tabs
+    out = []
+    for para in paragraphs:
+        joined = ' '.join(seg.strip() for seg in para.split('\n') if seg.strip())
+        if joined:  # skip pieces that held no visible text
+            out.append(joined)
+    return '\n\n'.join(out)
+
+
 def wrap_by_canvas(
     raw_lines: List[str],
     content_width_px: float,

From 9b7c06841995e656c74e0b5114989b53f12d0744 Mon Sep 17 00:00:00 2001
From: ariedotcodotnz <ariedotcodotnz@users.noreply.github.com>
Date: Wed, 10 Jun 2026 04:27:55 +0000
Subject: [PATCH 21/21] fix(auto-sizing): handle total_raw_width calculation
 for improved writing size estimation

---
 handwriting_synthesis/hand/Hand.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/handwriting_synthesis/hand/Hand.py b/handwriting_synthesis/hand/Hand.py
index 7f24a0f..b74f395 100644
--- a/handwriting_synthesis/hand/Hand.py
+++ b/handwriting_synthesis/hand/Hand.py
@@ -437,7 +437,7 @@ def _adaptive_chunk_chars(
 
     def _auto_fill_writing_size(
         self, chunk_strokes, n_blank_lines, page_size, units, margins,
-        orientation, x_stretch, model_xheight=None,
+        orientation, x_stretch, model_xheight=None, total_raw_width=None,
     ):
         """Pick a writing size (mm) so the text fills the page vertically.
 
@@ -458,10 +458,11 @@ def _auto_fill_writing_size(
             if not model_xheight:
                 return None
             content_w, content_h = self._content_box_px(page_size, units, margins, orientation)
-            total_raw_width = sum(
-                get_stroke_width(s) for s in chunk_strokes
-                if s is not None and len(s) > 0
-            )
+            if total_raw_width is None:
+                total_raw_width = sum(
+                    get_stroke_width(s) for s in chunk_strokes
+                    if s is not None and len(s) > 0
+                )
             xs = float(x_stretch) if x_stretch else 1.0
             if xs <= 0:
                 xs = 1.0
@@ -850,15 +851,21 @@ def write_chunked(
             # simply written larger. Long texts stay at the base natural size.
             all_strokes_flat = [s for entry in sampled_lines if entry for s in entry[1]]
             n_blank_lines = sum(1 for entry in sampled_lines if entry is None)
-            # Measure the x-height on line-sized STITCHED groups -- the statistic
-            # _draw actually scales by -- so wrap-time predictions match the
-            # rendered size and lines reach the right margin.
-            stitched_xheight = self._estimate_stitched_xheight(
+            # Measure x-height and width inflation on line-sized STITCHED groups
+            # -- the statistics _draw actually renders with -- so wrap-time
+            # predictions match the rendered output.
+            stitched_xheight, stitch_width_factor = self._estimate_stitched_xheight(
                 sampled_lines, chunk_spacing, rotate_chunks)
             if auto_size and writing_size_mm is None and all_strokes_flat:
+                # Stitched lines come out wider than the sum of their chunks, so
+                # the text effectively occupies stitch_width_factor more width
+                # when the solver estimates how many lines it will wrap into.
+                total_w = sum(get_stroke_width(s) for s in all_strokes_flat
+                              if s is not None and len(s) > 0)
                 fitted_mm = self._auto_fill_writing_size(
                     all_strokes_flat, n_blank_lines, page_size, units, margins,
                     orientation, x_stretch, model_xheight=stitched_xheight,
+                    total_raw_width=total_w * stitch_width_factor,
                 )
                 if fitted_mm:
                     effective_writing_size_mm = fitted_mm
@@ -868,6 +875,11 @@ def write_chunked(
                 orientation, effective_writing_size_mm, x_stretch, auto_size,
                 model_xheight=stitched_xheight,
             )
+            # Deflate the budget by the stitch widening: the DP below compares it
+            # against SUMS of chunk widths, but the stitched line will measure
+            # stitch_width_factor wider -- without this the widest lines overrun
+            # the page and the width clamp shrinks the writing size to fit them.
+            effective_max_line_width /= stitch_width_factor
             # Allow a squeeze past the wrap limit: a writer fits one more word by
             # tightening slightly rather than leaving a ragged gap. _draw condenses
             # such lines by the same few percent per line (line_scale_x).