From 0aa0fc8255b55de9fb77d62427d0fb48af852146 Mon Sep 17 00:00:00 2001
From: hcphat <hcphat@tma.com.vn>
Date: Fri, 22 May 2026 16:36:38 +0700
Subject: [PATCH 1/4] =?UTF-8?q?Ref=202.1.IdP=E3=81=8B=E3=82=89=E5=8F=97?=
 =?UTF-8?q?=E9=A0=98=E3=81=99=E3=82=8B=E8=AA=8D=E8=A8=BC=E6=83=85=E5=A0=B1?=
 =?UTF-8?q?=E8=A1=A8=E7=A4=BA=E3=81=AE=E6=97=A5=E6=9C=AC=E8=AA=9E=E6=96=87?=
 =?UTF-8?q?=E5=AD=97=E5=8C=96=E3=81=91=E8=A7=A3=E6=B6=88:=20commit=20code?=
 =?UTF-8?q?=20fix=20japanese=20encoding=20error?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 admin/common_auth/views.py            |   4 +-
 admin_tests/common_auth/test_views.py | 310 +++++++++++++++++++++++++-
 2 files changed, 310 insertions(+), 4 deletions(-)

diff --git a/admin/common_auth/views.py b/admin/common_auth/views.py
index 4df69bfabac..5724b32d47c 100644
--- a/admin/common_auth/views.py
+++ b/admin/common_auth/views.py
@@ -109,7 +109,9 @@ def dispatch(self, request, *args, **kwargs):
                 return redirect('auth:login')
             else:
                 tmp_eppn = ('tmp_eppn_' + eppn).lower()
-                new_user, created = get_or_create_user(request.environ['HTTP_AUTH_DISPLAYNAME'] or 'NO NAME', tmp_eppn, reset_password=False)
+                raw_display_name = request.environ.get('HTTP_AUTH_DISPLAYNAME', '')
+                display_name = raw_display_name.encode('iso-8859-1').decode('utf-8') if raw_display_name else ''
+                new_user, created = get_or_create_user(display_name or 'NO NAME', tmp_eppn, reset_password=False)
                 USE_EPPN = login_by_eppn()
                 if USE_EPPN:
                     new_user.eppn = eppn
diff --git a/admin_tests/common_auth/test_views.py b/admin_tests/common_auth/test_views.py
index 35e9a5c097a..6ea2d1173b4 100644
--- a/admin_tests/common_auth/test_views.py
+++ b/admin_tests/common_auth/test_views.py
@@ -3,13 +3,15 @@
 
 from django.test import RequestFactory
 from django.http import Http404
+from django.urls import reverse
+from django.contrib.auth import REDIRECT_FIELD_NAME
 from tests.base import AdminTestCase
-from osf_tests.factories import AuthUserFactory
+from osf_tests.factories import AuthUserFactory, InstitutionFactory
 
-from admin_tests.utilities import setup_form_view
+from admin_tests.utilities import setup_form_view, setup_view
 
 from osf.models.user import OSFUser
-from admin.common_auth.views import RegisterUser
+from admin.common_auth.views import RegisterUser, ShibLoginView
 from admin.common_auth.forms import UserRegistrationForm
 
 
@@ -40,3 +42,305 @@ def test_add_user(self, mock_save):
         view.form_valid(form)
         nt.assert_true(mock_save.called)
         nt.assert_equal(OSFUser.objects.count(), count + 1)
+
+class TestShibLoginView(AdminTestCase):
+    """
+    Test ShibLoginView.dispatch and get_success_url.
+    """
+
+    EPPN_DOMAIN = 'example.ac.jp'
+    EPPN = 'testuser@' + EPPN_DOMAIN
+    ENTITLEMENT_ADMIN = 'GakuNinRDMAdmin'
+    DISPLAY_NAME = 'Test User'
+
+    def setUp(self):
+        super(TestShibLoginView, self).setUp()
+        self.institution = InstitutionFactory(domains=[self.EPPN_DOMAIN])
+
+    def _make_request(self, eppn=None, entitlement='', displayname=None):
+        """Helper: create a GET request with Shibboleth environ headers."""
+        eppn = eppn if eppn is not None else self.EPPN
+        displayname = displayname or self.DISPLAY_NAME
+        request = RequestFactory().get('fake_path')
+        request.environ['HTTP_AUTH_EPPN'] = eppn
+        request.environ['HTTP_AUTH_ENTITLEMENT'] = entitlement
+        request.environ['HTTP_AUTH_DISPLAYNAME'] = displayname
+        return request
+
+    # ------------------------------------------------------------------
+    # No institution found for the eppn domain
+    # ------------------------------------------------------------------
+    def test_no_institution_redirects_to_login(self):
+        request = self._make_request(eppn='user@unknown.domain.jp')
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        nt.assert_equal(response.status_code, 302)
+        nt.assert_in('login', response.url)
+
+    # ------------------------------------------------------------------
+    # eppn is empty string but institution lookup is mocked to
+    #           return a result (otherwise empty domain fails at branch A)
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.messages.error')
+    def test_empty_eppn_redirects_to_login(self, mock_error):
+        request = self._make_request(eppn='')
+        with mock.patch('admin.common_auth.views.Institution') as mock_inst_cls:
+            mock_inst_cls.objects.filter.return_value.first.return_value = self.institution
+            view = setup_view(ShibLoginView(), request)
+            response = view.dispatch(request)
+        nt.assert_equal(response.status_code, 302)
+        nt.assert_in('login', response.url)
+        nt.assert_true(mock_error.called)
+
+    # ------------------------------------------------------------------
+    # Existing user + GakuNinRDMAdmin entitlement → login
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    def test_existing_user_with_admin_entitlement_logs_in(self, mock_login, mock_check, mock_keygen):
+        user = AuthUserFactory()
+        user.eppn = self.EPPN
+        user.save()
+        request = self._make_request(entitlement=self.ENTITLEMENT_ADMIN)
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        user.refresh_from_db()
+        nt.assert_true(user.is_staff)
+        nt.assert_true(mock_login.called)
+        nt.assert_true(mock_keygen.called)
+        nt.assert_equal(response.status_code, 302)
+
+    # ------------------------------------------------------------------
+    # Existing user + no admin entitlement → redirect error
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.messages.error')
+    def test_existing_user_without_admin_entitlement_redirects(self, mock_error):
+        user = AuthUserFactory()
+        user.eppn = self.EPPN
+        user.save()
+        request = self._make_request(entitlement='SomeOtherEntitlement')
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        user.refresh_from_db()
+        nt.assert_false(user.is_staff)
+        nt.assert_true(mock_error.called)
+        nt.assert_equal(response.status_code, 302)
+        nt.assert_in('login', response.url)
+
+    # ------------------------------------------------------------------
+    # New user (no eppn match) + no admin entitlement → redirect error
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.messages.error')
+    def test_new_user_without_admin_entitlement_redirects(self, mock_error):
+        request = self._make_request(
+            eppn='nouser@' + self.EPPN_DOMAIN,
+            entitlement='SomeOtherEntitlement',
+        )
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        nt.assert_true(mock_error.called)
+        nt.assert_equal(response.status_code, 302)
+        nt.assert_in('login', response.url)
+
+    # ------------------------------------------------------------------
+    # New user + admin entitlement → user created, is_staff=True
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    def test_new_user_with_admin_entitlement_creates_and_logs_in(self, mock_login, mock_check, mock_keygen):
+        new_eppn = 'brandnew@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        count_before = OSFUser.objects.count()
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        nt.assert_equal(OSFUser.objects.count(), count_before + 1)
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_true(new_user.is_staff)
+        nt.assert_false(new_user.have_email)
+        nt.assert_true(mock_login.called)
+        nt.assert_equal(response.status_code, 302)
+
+    # ------------------------------------------------------------------
+    # Existing user → other institutions removed, current added
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=True)
+    @mock.patch('admin.common_auth.views.login')
+    def test_existing_user_institution_updated(self, mock_login, mock_check, mock_keygen):
+        other_institution = InstitutionFactory()
+        user = AuthUserFactory()
+        user.eppn = self.EPPN
+        user.affiliated_institutions.add(other_institution)
+        user.save()
+        request = self._make_request(entitlement=self.ENTITLEMENT_ADMIN)
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+        user.refresh_from_db()
+        nt.assert_false(
+            user.affiliated_institutions.filter(id=other_institution.id).exists()
+        )
+        nt.assert_true(
+            user.affiliated_institutions.filter(id=self.institution.id).exists()
+        )
+
+    # ------------------------------------------------------------------
+    # userkey_generation_check=True → userkey_generation NOT called
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=True)
+    @mock.patch('admin.common_auth.views.login')
+    def test_userkey_not_regenerated_when_exists(self, mock_login, mock_check, mock_keygen):
+        user = AuthUserFactory()
+        user.eppn = self.EPPN
+        user.save()
+        request = self._make_request(entitlement=self.ENTITLEMENT_ADMIN)
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+        nt.assert_false(mock_keygen.called)
+
+    # ------------------------------------------------------------------
+    # get_success_url with no param → reverse('home')
+    # ------------------------------------------------------------------
+    def test_get_success_url_defaults_to_home(self):
+        request = RequestFactory().get('fake_path')
+        view = setup_view(ShibLoginView(), request)
+        nt.assert_equal(view.get_success_url(), reverse('home'))
+
+    # '/' param also falls back to home
+    def test_get_success_url_with_slash_defaults_to_home(self):
+        request = RequestFactory().get('fake_path', {REDIRECT_FIELD_NAME: '/'})
+        view = setup_view(ShibLoginView(), request)
+        nt.assert_equal(view.get_success_url(), reverse('home'))
+
+    # ------------------------------------------------------------------
+    # get_success_url with custom redirect param
+    # ------------------------------------------------------------------
+    def test_get_success_url_uses_redirect_param(self):
+        request = RequestFactory().get('fake_path', {REDIRECT_FIELD_NAME: '/admin/nodes/'})
+        view = setup_view(ShibLoginView(), request)
+        nt.assert_equal(view.get_success_url(), '/admin/nodes/')
+
+    # ------------------------------------------------------------------
+    # New user + admin entitlement + USE_EPPN=True
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_use_eppn_true(self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        new_eppn = 'eppntrue@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_true(new_user.is_staff)
+        nt.assert_false(new_user.have_email)
+        nt.assert_equal(new_user.eppn, new_eppn)
+        nt.assert_equal(response.status_code, 302)
+
+    # ------------------------------------------------------------------
+    # New user + admin entitlement + USE_EPPN=False
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=False)
+    def test_new_user_use_eppn_false(self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        new_eppn = 'eppnfalse@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        view = setup_view(ShibLoginView(), request)
+        response = view.dispatch(request)
+        # Lines 120-121 override the else-branch values, so final state is same
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_true(new_user.is_staff)
+        nt.assert_false(new_user.have_email)
+        nt.assert_equal(new_user.eppn, new_eppn)
+        nt.assert_equal(response.status_code, 302)
+
+    # ------------------------------------------------------------------
+    # Japanese displayname: Apache/WSGI passes UTF-8 bytes as Latin-1 (mojibake).
+    # Fix encodes back to iso-8859-1 then decodes as utf-8 → original Japanese.
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_japanese_displayname_decoded_correctly(
+            self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        japanese_name = '山田 太郎'
+        # Simulate WSGI: UTF-8 bytes of the Japanese string interpreted as Latin-1
+        mojibake = japanese_name.encode('utf-8').decode('latin-1')
+
+        new_eppn = 'jauser@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        request.environ['HTTP_AUTH_DISPLAYNAME'] = mojibake
+
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_equal(new_user.fullname, japanese_name)
+
+    # ------------------------------------------------------------------
+    # Multi-byte Japanese name with organization prefix (real-world case).
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_japanese_fullwidth_displayname_decoded_correctly(
+            self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        japanese_name = '国立情報学研究所　鈴木一郎'
+        mojibake = japanese_name.encode('utf-8').decode('latin-1')
+
+        new_eppn = 'jafull@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        request.environ['HTTP_AUTH_DISPLAYNAME'] = mojibake
+
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_equal(new_user.fullname, japanese_name)
+
+    # ------------------------------------------------------------------
+    # Empty displayname → falls back to 'NO NAME'
+    # covers: display_name = '' → get_or_create_user('' or 'NO NAME', ...)
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_empty_displayname_falls_back_to_no_name(
+            self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        new_eppn = 'noname@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        request.environ['HTTP_AUTH_DISPLAYNAME'] = ''  # override helper default
+
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_equal(new_user.fullname, 'NO NAME')
+
+    # ------------------------------------------------------------------
+    # ASCII displayname passes through unchanged
+    # encode('iso-8859-1').decode('utf-8') on pure ASCII is identity
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.userkey_generation')
+    @mock.patch('admin.common_auth.views.userkey_generation_check', return_value=False)
+    @mock.patch('admin.common_auth.views.login')
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_ascii_displayname_passes_through(
+            self, mock_use_eppn, mock_login, mock_check, mock_keygen):
+        new_eppn = 'ascii@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        request.environ['HTTP_AUTH_DISPLAYNAME'] = 'John Smith'
+
+        view = setup_view(ShibLoginView(), request)
+        view.dispatch(request)
+
+        new_user = OSFUser.objects.get(eppn=new_eppn)
+        nt.assert_equal(new_user.fullname, 'John Smith')

From 2358fd6b8b68fb8b616879dfb8f098fa5ffa351b Mon Sep 17 00:00:00 2001
From: hcphat <hcphat@tma.com.vn>
Date: Wed, 27 May 2026 10:06:15 +0700
Subject: [PATCH 2/4] =?UTF-8?q?Ref=202.1.IdP=E3=81=8B=E3=82=89=E5=8F=97?=
 =?UTF-8?q?=E9=A0=98=E3=81=99=E3=82=8B=E8=AA=8D=E8=A8=BC=E6=83=85=E5=A0=B1?=
 =?UTF-8?q?=E8=A1=A8=E7=A4=BA=E3=81=AE=E6=97=A5=E6=9C=AC=E8=AA=9E=E6=96=87?=
 =?UTF-8?q?=E5=AD=97=E5=8C=96=E3=81=91=E8=A7=A3=E6=B6=88:=20Update=20raw?=
 =?UTF-8?q?=5Fdisplay=5Fname=20and=20UT=20for=20KeyError?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 admin/common_auth/views.py            |  2 +-
 admin_tests/common_auth/test_views.py | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/admin/common_auth/views.py b/admin/common_auth/views.py
index 5724b32d47c..d2dc3993090 100644
--- a/admin/common_auth/views.py
+++ b/admin/common_auth/views.py
@@ -109,7 +109,7 @@ def dispatch(self, request, *args, **kwargs):
                 return redirect('auth:login')
             else:
                 tmp_eppn = ('tmp_eppn_' + eppn).lower()
-                raw_display_name = request.environ.get('HTTP_AUTH_DISPLAYNAME', '')
+                raw_display_name = request.environ['HTTP_AUTH_DISPLAYNAME']
                 display_name = raw_display_name.encode('iso-8859-1').decode('utf-8') if raw_display_name else ''
                 new_user, created = get_or_create_user(display_name or 'NO NAME', tmp_eppn, reset_password=False)
                 USE_EPPN = login_by_eppn()
diff --git a/admin_tests/common_auth/test_views.py b/admin_tests/common_auth/test_views.py
index 6ea2d1173b4..5bc23647714 100644
--- a/admin_tests/common_auth/test_views.py
+++ b/admin_tests/common_auth/test_views.py
@@ -344,3 +344,15 @@ def test_new_user_ascii_displayname_passes_through(
 
         new_user = OSFUser.objects.get(eppn=new_eppn)
         nt.assert_equal(new_user.fullname, 'John Smith')
+
+    # ------------------------------------------------------------------
+    # Missing HTTP_AUTH_DISPLAYNAME header → KeyError
+    # ------------------------------------------------------------------
+    @mock.patch('admin.common_auth.views.login_by_eppn', return_value=True)
+    def test_new_user_missing_displayname_header_raises_key_error(self, mock_use_eppn):
+        new_eppn = 'missing@' + self.EPPN_DOMAIN
+        request = self._make_request(eppn=new_eppn, entitlement=self.ENTITLEMENT_ADMIN)
+        del request.environ['HTTP_AUTH_DISPLAYNAME']
+        view = setup_view(ShibLoginView(), request)
+        with nt.assert_raises(KeyError):
+            view.dispatch(request)

From 48f466fa61e2335ac4610f258a38b3032aa82faf Mon Sep 17 00:00:00 2001
From: hcphat <hcphat@tma.com.vn>
Date: Fri, 29 May 2026 14:09:50 +0700
Subject: [PATCH 3/4] =?UTF-8?q?Ref=202.1.IdP=E3=81=8B=E3=82=89=E5=8F=97?=
 =?UTF-8?q?=E9=A0=98=E3=81=99=E3=82=8B=E8=AA=8D=E8=A8=BC=E6=83=85=E5=A0=B1?=
 =?UTF-8?q?=E8=A1=A8=E7=A4=BA=E3=81=AE=E6=97=A5=E6=9C=AC=E8=AA=9E=E6=96=87?=
 =?UTF-8?q?=E5=AD=97=E5=8C=96=E3=81=91=E8=A7=A3=E6=B6=88:=20Add=20script?=
 =?UTF-8?q?=20update=20record=20error=20encoding?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/fix_encoding_errors.py | 535 +++++++++++++++++++++++++++++++++
 1 file changed, 535 insertions(+)
 create mode 100644 scripts/fix_encoding_errors.py

diff --git a/scripts/fix_encoding_errors.py b/scripts/fix_encoding_errors.py
new file mode 100644
index 00000000000..11017c8355a
--- /dev/null
+++ b/scripts/fix_encoding_errors.py
@@ -0,0 +1,535 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+fix_encoding_errors.py
+
+Scan and fix Japanese encoding issues and HTML entities using Django ORM,
+then export the results directly to a CSV file.
+This script targets:
+  - UserExtendedData (field: data JSONB)
+  - OSFUser (fields: fullname, given_name, middle_names, family_name,
+                      suffix, given_name_ja, middle_names_ja, family_name_ja,
+                      department, jobs, schools, social)
+
+Usage:
+    # Scan, update encoding errors in DB using Django ORM, and export results to CSV
+    python -m scripts.fix_encoding_errors
+    python -m scripts.fix_encoding_errors --output result.csv
+
+    # Preview updates and export results to CSV without making changes to DB (dry run)
+    python -m scripts.fix_encoding_errors --dry
+    python -m scripts.fix_encoding_errors --dry --include-unrecoverable --output preview.csv
+
+    # Also apply partial fixes to records where encoding cannot be fully recovered
+    python -m scripts.fix_encoding_errors --include-unrecoverable
+"""
+
+import os
+import re
+import csv
+import json
+import logging
+import copy
+import argparse
+from datetime import datetime
+
+import django
+from django.db import transaction
+
+# Setup Django before importing models
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'api.base.settings')
+django.setup()
+
+from osf.models import UserExtendedData, OSFUser
+from scripts import utils as script_utils
+
+logger = logging.getLogger(__name__)
+
+# --- Patterns ---
+HTML_ENTITY_RE = re.compile(r'&#\d+;|&#x[0-9a-fA-F]+;')
+JAPANESE_ERROR_ENCODING_HINT_RE = re.compile(r'[\u00c0-\u00cf\u00e0-\u00ef\u00aa\u00ba]')
+_NUM_ENT_RE = re.compile(r'&#(\d+);|&#x([0-9a-fA-F]+);')
+
+UNRECOVERABLE_NOTICE = '[UNRECOVERABLE: missing bytes - partial result] '
+UNRECOVERABLE_CANNOT_FIX = '[UNRECOVERABLE: cannot fix]'
+
+OSFUSER_STRING_COLUMNS = [
+    'fullname',
+    'given_name',
+    'middle_names',
+    'family_name',
+    'suffix',
+    'given_name_ja',
+    'middle_names_ja',
+    'family_name_ja',
+    'department',
+]
+
+OSFUSER_JSON_COLUMNS = [
+    'jobs',
+    'schools',
+    'social',
+]
+
+RESULT_FIELDNAMES = [
+    'source_table', 'id', 'user_guid', 'user_id', 'field_path',
+    'issues', 'original_value', 'suggested_fix', 'fix_applied', 'value_after', 'fix_status',
+]
+
+# --- Helper Functions ---
+
+def _entities_as_latin1(s):
+    """Replace &#NNN; numeric entities with chr(N), keeping within Latin-1 range."""
+    def repl(m):
+        n = int(m.group(1)) if m.group(1) is not None else int(m.group(2), 16)
+        return chr(n) if n <= 255 else m.group(0)
+    return _NUM_ENT_RE.sub(repl, s)
+
+
+def try_fix_japanese_error_encoding(s):
+    """
+    Attempt to fix Japanese error encoding by re-encoding as latin-1 and decoding as utf-8.
+    Returns (fixed_string, is_partial):
+      - is_partial=True  : decode succeeded with errors='replace' (some bytes were lost)
+      - is_partial=False : decode succeeded cleanly
+      - (None, False)    : fix not possible or no improvement
+    """
+    partial = False
+    try:
+        fixed = s.encode('latin1').decode('utf-8')
+    except UnicodeDecodeError:
+        try:
+            fixed = s.encode('latin1').decode('utf-8', errors='replace')
+            partial = True
+        except Exception:
+            return None, False
+    except Exception:
+        return None, False
+
+    orig_bad = len(JAPANESE_ERROR_ENCODING_HINT_RE.findall(s))
+    fixed_bad = len(JAPANESE_ERROR_ENCODING_HINT_RE.findall(fixed))
+    if fixed_bad < orig_bad:
+        return fixed, partial
+    return None, False
+
+
+def detect_issues(s):
+    """Return list of issue labels found in string s."""
+    issues = []
+    if HTML_ENTITY_RE.search(s):
+        issues.append('html_entities')
+    if JAPANESE_ERROR_ENCODING_HINT_RE.search(s):
+        issues.append('japanese_error_encoding')
+    return issues
+
+
+def suggest_fix(s, issues):
+    """
+    Return (suggested_fix, is_partial) for the given string and detected issues.
+    Returns (None, False) if no fix is possible.
+    """
+    if 'html_entities' in issues:
+        fixed, partial = try_fix_japanese_error_encoding(_entities_as_latin1(s))
+        if fixed:
+            return fixed, partial
+    if 'japanese_error_encoding' in issues:
+        fixed, partial = try_fix_japanese_error_encoding(s)
+        if fixed:
+            return fixed, partial
+    return None, False
+
+
+def walk(obj, path=''):
+    """Recursively yield (path, str_value) for all string leaves in a JSON object."""
+    if isinstance(obj, dict):
+        for k, v in obj.items():
+            yield from walk(v, '{}.{}'.format(path, k) if path else k)
+    elif isinstance(obj, list):
+        for i, v in enumerate(obj):
+            yield from walk(v, '{}[{}]'.format(path, i))
+    elif isinstance(obj, str):
+        yield (path, obj)
+
+
+def compute_fix_status(suggested_fix):
+    """Return 'clean', 'partial', or 'unrecoverable' based on suggested_fix."""
+    if suggested_fix == UNRECOVERABLE_CANNOT_FIX:
+        return 'unrecoverable'
+    if suggested_fix.startswith(UNRECOVERABLE_NOTICE):
+        return 'partial'
+    return 'clean'
+
+
+def make_error_row(source_table, record_id, user_guid, user_id, path, val):
+    """Build a single error row dict from a string value that has issues."""
+    issues = detect_issues(val)
+    if not issues:
+        return None
+    fixed, is_partial = suggest_fix(val, issues)
+    if fixed is None:
+        suggested = UNRECOVERABLE_CANNOT_FIX
+    elif is_partial:
+        suggested = UNRECOVERABLE_NOTICE + fixed
+    else:
+        suggested = fixed
+    return {
+        'source_table':   source_table,
+        'id':             record_id,
+        'user_guid':      user_guid,
+        'user_id':        user_id,
+        'field_path':     path,
+        'issues':         ','.join(issues),
+        'original_value': val,
+        'suggested_fix':  suggested,
+        'fix_status':     compute_fix_status(suggested),
+    }
+
+
+def is_fixable(row, include_unrecoverable):
+    """Determine whether a row can/should be applied as a fix."""
+    sf = row['suggested_fix']
+    if sf == UNRECOVERABLE_CANNOT_FIX:
+        return False
+    if sf.startswith(UNRECOVERABLE_NOTICE):
+        return include_unrecoverable
+    return True
+
+
+def get_clean_fix(suggested_fix):
+    """Return the actual fix value (strip UNRECOVERABLE_NOTICE prefix if present)."""
+    if suggested_fix.startswith(UNRECOVERABLE_NOTICE):
+        return suggested_fix[len(UNRECOVERABLE_NOTICE):]
+    return suggested_fix
+
+
+def set_nested_value(obj, path, new_value):
+    """
+    Set a value inside a nested dict/list structure using a dot/bracket path string.
+    E.g. path='jobs[0].institution' -> obj['jobs'][0]['institution'] = new_value
+    Returns the modified obj (mutated in-place).
+    """
+    tokens = re.split(r'\.|\[(\d+)\]', path)
+    parts = []
+    for t in tokens:
+        if t is None or t == '':
+            continue
+        if t.isdigit():
+            parts.append(int(t))
+        else:
+            parts.append(t)
+
+    node = obj
+    for part in parts[:-1]:
+        node = node[part]
+    node[parts[-1]] = new_value
+    return obj
+
+
+# --- Scanning Operations ---
+
+def scan_userextendeddata():
+    """Scan UserExtendedData.data JSONB field for encoding issues."""
+    records = UserExtendedData.objects.select_related('user').all()
+
+    logger.info('Scanning UserExtendedData records...')
+    rows = []
+    for record in records.iterator():
+        data = record.data or {}
+        user_guid = record.user._id if record.user else 'N/A'
+        user_id = record.user.id if record.user else 'N/A'
+
+        for path, val in walk(data):
+            row = make_error_row('osf_userextendeddata', record.id, user_guid, user_id, path, val)
+            if row:
+                rows.append(row)
+    logger.info('UserExtendedData scan completed. Found {} error(s).'.format(len(rows)))
+    return rows
+
+
+def scan_osfuser():
+    """Scan OSFUser string and JSON columns for encoding issues."""
+    records = OSFUser.objects.all()
+
+    logger.info('Scanning OSFUser records...')
+    rows = []
+    for user in records.iterator():
+        user_id = user.id
+        user_guid = user._id or 'N/A'
+
+        # Simple string columns
+        for col in OSFUSER_STRING_COLUMNS:
+            val = getattr(user, col, None)
+            if not val:
+                continue
+            row = make_error_row('osf_osfuser', user_id, user_guid, user_id, col, val)
+            if row:
+                rows.append(row)
+
+        # JSON columns
+        for col in OSFUSER_JSON_COLUMNS:
+            data = getattr(user, col, None)
+            if not data:
+                continue
+            for path, val in walk(data):
+                row = make_error_row('osf_osfuser', user_id, user_guid, user_id,
+                                     '{}.{}'.format(col, path), val)
+                if row:
+                    rows.append(row)
+    logger.info('OSFUser scan completed. Found {} error(s).'.format(len(rows)))
+    return rows
+
+
+def print_errors(rows):
+    """Display scanned encoding errors in a formatted table."""
+    if not rows:
+        print('   No encoding errors found.')
+        return
+
+    print('\n' + '-' * 180)
+    print(' {:<22} | {:>8} | {:>10} | {:<12} | {:<30} | {:<25} | {:<35} | {}'.format(
+        'SOURCE', 'ID', 'USER_ID', 'USER_GUID', 'FIELD_PATH', 'ISSUES', 'ORIGINAL_VALUE', 'SUGGESTED_FIX'
+    ))
+    print('-' * 180)
+    for r in rows:
+        orig_val = r['original_value'] or ''
+        suggested = r['suggested_fix'] or ''
+        val_display = orig_val[:33] + '...' if len(orig_val) > 33 else orig_val
+        fix_display = suggested[:50] + '...' if len(suggested) > 50 else suggested
+        print(
+            ' {:<22} | {:>8} | {:>10} | {:<12} | '
+            '{:<30} | {:<25} | {:<35} | {}'.format(
+                r['source_table'] or '',
+                str(r['id']) if r['id'] is not None else '',
+                str(r['user_id']) if r['user_id'] is not None else '',
+                str(r['user_guid']) if r['user_guid'] is not None else '',
+                (r['field_path'] or '')[:30],
+                r['issues'] or '',
+                val_display,
+                fix_display
+            )
+        )
+    print('-' * 180)
+
+    # Summary
+    by_table = {}
+    for r in rows:
+        by_table[r['source_table']] = by_table.get(r['source_table'], 0) + 1
+    for table, count in sorted(by_table.items()):
+        print('   {}: {} error(s)'.format(table, count))
+    print(' Total: {} error(s)'.format(len(rows)))
+
+
+def save_result_csv(result_rows, output_path):
+    """Save full results with before/after status to CSV."""
+    with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
+        writer = csv.DictWriter(f, fieldnames=RESULT_FIELDNAMES)
+        writer.writeheader()
+        writer.writerows(result_rows)
+    print('\n Exported {} result(s) to: {}'.format(len(result_rows), output_path))
+
+
+# --- Updating Operations ---
+
+def apply_fixes(rows, include_unrecoverable, dry_run=False):
+    """
+    Apply fixes to database using Django ORM inside a transaction.
+    If dry_run is True, the transaction will be rolled back at the end.
+    Returns list of result rows with fix status.
+    """
+    # Group changes by model instance to minimize database writes
+    ued_updates = {}
+    user_updates = {}
+
+    for row in rows:
+        if row['source_table'] == 'osf_userextendeddata':
+            ued_updates.setdefault(row['id'], []).append(row)
+        elif row['source_table'] == 'osf_osfuser':
+            user_updates.setdefault(row['id'], []).append(row)
+
+    result_rows = []
+    fixable_count = 0
+    skipped_count = 0
+
+    if dry_run:
+        logger.info('[DRY RUN] Simulating database updates inside a transaction...')
+    else:
+        logger.info('Starting database transaction to apply fixes...')
+
+    try:
+        with transaction.atomic():
+            # 1. Update UserExtendedData instances
+            for record_id, ued_rows in ued_updates.items():
+                try:
+                    record = UserExtendedData.objects.get(id=record_id)
+                except UserExtendedData.DoesNotExist:
+                    for row in ued_rows:
+                        result = dict(row)
+                        result['fix_applied'] = 'NO (RECORD NOT EXIST)'
+                        result['value_after'] = row['original_value']
+                        result_rows.append(result)
+                        skipped_count += 1
+                        logger.warning('   Skipped: [UserExtendedData] id={} path={} (record not exist)'.format(record_id, row['field_path']))
+                    continue
+
+                data = record.data or {}
+                has_changes = False
+
+                for row in ued_rows:
+                    result = dict(row)
+                    result['fix_applied'] = ''
+                    result['value_after'] = ''
+
+                    should_fix = is_fixable(row, include_unrecoverable)
+
+                    if should_fix:
+                        clean_fix = get_clean_fix(row['suggested_fix'])
+                        data = set_nested_value(data, row['field_path'], clean_fix)
+                        has_changes = True
+                        result['fix_applied'] = 'YES'
+                        result['value_after'] = clean_fix
+                        fixable_count += 1
+                        logger.info('   Fixed: [UserExtendedData] id={} path={}'.format(record_id, row['field_path']))
+                    else:
+                        result['fix_applied'] = 'NO (UNRECOVERABLE)'
+                        result['value_after'] = row['original_value']
+                        skipped_count += 1
+
+                    result_rows.append(result)
+
+                if has_changes:
+                    # Deepcopy to guarantee DirtyFieldsMixin registers the change
+                    record.data = copy.deepcopy(data)
+                    record.save()
+
+            # 2. Update OSFUser instances
+            for user_id, u_rows in user_updates.items():
+                try:
+                    user = OSFUser.objects.get(id=user_id)
+                except OSFUser.DoesNotExist:
+                    for row in u_rows:
+                        result = dict(row)
+                        result['fix_applied'] = 'NO (RECORD NOT EXIST)'
+                        result['value_after'] = row['original_value']
+                        result_rows.append(result)
+                        skipped_count += 1
+                        logger.warning('   Skipped: [OSFUser] id={} field={} (record not exist)'.format(user_id, row['field_path']))
+                    continue
+
+                has_changes = False
+
+                for row in u_rows:
+                    result = dict(row)
+                    result['fix_applied'] = ''
+                    result['value_after'] = ''
+
+                    should_fix = is_fixable(row, include_unrecoverable)
+
+                    if should_fix:
+                        clean_fix = get_clean_fix(row['suggested_fix'])
+                        path = row['field_path']
+                        top_col = path.split('.')[0].split('[')[0]
+
+                        if top_col in OSFUSER_STRING_COLUMNS:
+                            setattr(user, top_col, clean_fix)
+                            has_changes = True
+                            result['fix_applied'] = 'YES'
+                            result['value_after'] = clean_fix
+                            fixable_count += 1
+                            logger.info('   Fixed: [OSFUser] id={} field={}'.format(user_id, path))
+                        elif top_col in OSFUSER_JSON_COLUMNS:
+                            col_data = getattr(user, top_col, None) or (dict() if top_col == 'social' else list())
+                            sub_path = path[len(top_col):]
+                            if sub_path.startswith('.'):
+                                sub_path = sub_path[1:]
+
+                            col_data = set_nested_value(col_data, sub_path, clean_fix)
+                            setattr(user, top_col, copy.deepcopy(col_data))
+                            has_changes = True
+                            result['fix_applied'] = 'YES'
+                            result['value_after'] = clean_fix
+                            fixable_count += 1
+                            logger.info('   Fixed: [OSFUser] id={} field={}'.format(user_id, path))
+                    else:
+                        result['fix_applied'] = 'NO (UNRECOVERABLE)'
+                        result['value_after'] = row['original_value']
+                        skipped_count += 1
+
+                    result_rows.append(result)
+
+                if has_changes:
+                    user.save()
+
+            if dry_run:
+                # Rollback transaction so no database changes are saved
+                transaction.set_rollback(True)
+                logger.info('[DRY RUN] Transaction rolled back successfully. No changes saved.')
+            else:
+                logger.info('Transaction committed successfully.')
+
+        if not dry_run:
+            logger.info('Applied: {} fix(es), Skipped: {} record(s)'.format(fixable_count, skipped_count))
+        else:
+            logger.info('[DRY RUN] Would apply {} fix(es) and skip {} record(s)'.format(fixable_count, skipped_count))
+
+    except Exception as e:
+        logger.exception('Error during transaction execution - rolled back.')
+        raise
+
+    return result_rows
+
+
+# --- Main ---
+
+def main():
+    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+    default_output = f'fix_result_{ts}.csv'
+
+    parser = argparse.ArgumentParser(
+        description='Scan and fix encoding errors in UserExtendedData and OSFUser using Django ORM.',
+    )
+    parser.add_argument(
+        '--output', '-o',
+        default=default_output,
+        help=f'Output CSV file path (default: {default_output})',
+    )
+    parser.add_argument(
+        '--include-unrecoverable', action='store_true', default=False,
+        help='Also apply fixes for records marked as unrecoverable/partial result.',
+    )
+    parser.add_argument(
+        '--dry', action='store_true', default=False,
+        help='Run script in dry-run mode (do not commit updates to the database).',
+    )
+    args = parser.parse_args()
+
+    if not args.dry:
+        # For actual runs modifying data, set up file logging
+        script_utils.add_file_logger(logger, __file__)
+
+    # Configure root/console logging format
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s %(levelname)s %(message)s',
+    )
+
+    # 1. Scan errors from database
+    rows = []
+    rows.extend(scan_userextendeddata())
+    rows.extend(scan_osfuser())
+
+    if not rows:
+        print('No encoding errors found.')
+        return
+
+    # Print the table of found errors
+    print_errors(rows)
+
+    # 2. Apply fixes (conditionally committing or rolling back)
+    result_rows = apply_fixes(rows, args.include_unrecoverable, dry_run=args.dry)
+
+    # 3. Export results to CSV
+    save_result_csv(result_rows, args.output)
+
+
+if __name__ == '__main__':
+    main()

From 06e7f23c1c70467bff392278164690b4b5b27cb3 Mon Sep 17 00:00:00 2001
From: hcphat <hcphat@tma.com.vn>
Date: Fri, 26 Jun 2026 18:28:26 +0700
Subject: [PATCH 4/4] =?UTF-8?q?Ref=202.1.IdP=E3=81=8B=E3=82=89=E5=8F=97?=
 =?UTF-8?q?=E9=A0=98=E3=81=99=E3=82=8B=E8=AA=8D=E8=A8=BC=E6=83=85=E5=A0=B1?=
 =?UTF-8?q?=E8=A1=A8=E7=A4=BA=E3=81=AE=E6=97=A5=E6=9C=AC=E8=AA=9E=E6=96=87?=
 =?UTF-8?q?=E5=AD=97=E5=8C=96=E3=81=91=E8=A7=A3=E6=B6=88:=20Update=20scrip?=
 =?UTF-8?q?t=20support=20input=20csv?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/fix_encoding_errors.py | 397 ++++++++++++++++++++++++++++-----
 1 file changed, 342 insertions(+), 55 deletions(-)

diff --git a/scripts/fix_encoding_errors.py b/scripts/fix_encoding_errors.py
index 11017c8355a..bfe9b020039 100644
--- a/scripts/fix_encoding_errors.py
+++ b/scripts/fix_encoding_errors.py
@@ -12,18 +12,16 @@
                       department, jobs, schools, social)
 
 Usage:
-    # Scan, update encoding errors in DB using Django ORM, and export results to CSV
-    python -m scripts.fix_encoding_errors
-    python -m scripts.fix_encoding_errors --output result.csv
-
     # Preview updates and export results to CSV without making changes to DB (dry run)
     python -m scripts.fix_encoding_errors --dry
-    python -m scripts.fix_encoding_errors --dry --include-unrecoverable --output preview.csv
+    python -m scripts.fix_encoding_errors --dry --output preview.csv
 
-    # Also apply partial fixes to records where encoding cannot be fully recovered
-    python -m scripts.fix_encoding_errors --include-unrecoverable
+    # Apply fixes using a previously reviewed CSV file
+    python -m scripts.fix_encoding_errors --input-csv preview.csv
+    python -m scripts.fix_encoding_errors --input-csv preview.csv --output result.csv
 """
 
+import sys
 import os
 import re
 import csv
@@ -35,6 +33,7 @@
 
 import django
 from django.db import transaction
+from django.core.exceptions import ValidationError as DjangoValidationError
 
 # Setup Django before importing models
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'api.base.settings')
@@ -53,6 +52,13 @@
 UNRECOVERABLE_NOTICE = '[UNRECOVERABLE: missing bytes - partial result] '
 UNRECOVERABLE_CANNOT_FIX = '[UNRECOVERABLE: cannot fix]'
 
+# Unicode replacement character (U+FFFD). It is produced when a partial decode
+# (errors='replace') drops missing bytes. It is a VALID character for the target
+# columns, so Django model constraints will NOT reject it on save(). We must
+# detect it explicitly: a value that still contains it has not been fully
+# recovered and must never be written to the DB.
+REPLACEMENT_CHAR = '�'
+
 OSFUSER_STRING_COLUMNS = [
     'fullname',
     'given_name',
@@ -101,9 +107,15 @@ def try_fix_japanese_error_encoding(s):
         try:
             fixed = s.encode('latin1').decode('utf-8', errors='replace')
             partial = True
-        except Exception:
+        except UnicodeError:
+            # Expected encoding failure -> this string cannot be fixed by the
+            # latin1->utf-8 round-trip. Return None so the caller marks it as
+            # unrecoverable and continues. Any other (unexpected) exception is
+            # left to propagate, per fail-fast.
             return None, False
-    except Exception:
+    except UnicodeError:
+        # s contains characters outside Latin-1 (e.g. real Japanese mixed with
+        # mojibake): this fix method does not apply. Treat as "cannot fix".
         return None, False
 
     orig_bad = len(JAPANESE_ERROR_ENCODING_HINT_RE.findall(s))
@@ -185,16 +197,6 @@ def make_error_row(source_table, record_id, user_guid, user_id, path, val):
     }
 
 
-def is_fixable(row, include_unrecoverable):
-    """Determine whether a row can/should be applied as a fix."""
-    sf = row['suggested_fix']
-    if sf == UNRECOVERABLE_CANNOT_FIX:
-        return False
-    if sf.startswith(UNRECOVERABLE_NOTICE):
-        return include_unrecoverable
-    return True
-
-
 def get_clean_fix(suggested_fix):
     """Return the actual fix value (strip UNRECOVERABLE_NOTICE prefix if present)."""
     if suggested_fix.startswith(UNRECOVERABLE_NOTICE):
@@ -225,6 +227,46 @@ def set_nested_value(obj, path, new_value):
     return obj
 
 
+def _get_nested_value(obj, path):
+    """
+    Read a value from a nested dict/list structure using a dot/bracket path string.
+    Mirror of set_nested_value — raises KeyError/IndexError if path does not exist.
+    """
+    tokens = re.split(r'\.|\[(\d+)\]', path)
+    parts = []
+    for t in tokens:
+        if t is None or t == '':
+            continue
+        parts.append(int(t) if t.isdigit() else t)
+    node = obj
+    for part in parts:
+        node = node[part]
+    return node
+
+
+def _get_osfuser_field_value(user, field_path):
+    """
+    Read the current value from an OSFUser instance at the given field_path.
+    Supports both simple string columns and nested JSON columns.
+    Raises ValueError for unknown top-level columns.
+    """
+    top_col = field_path.split('.')[0].split('[')[0]
+    if top_col in OSFUSER_STRING_COLUMNS:
+        return getattr(user, top_col)
+    if top_col in OSFUSER_JSON_COLUMNS:
+        # A NULL/empty JSON column is a VALID data state (the user simply has no
+        # social/jobs/schools entries), NOT a data defect to fail-fast on. Default
+        # to an empty container so the nested read below behaves consistently.
+        col_data = getattr(user, top_col) or (dict() if top_col == 'social' else list())
+        sub_path = field_path[len(top_col):]
+        if sub_path.startswith('.'):
+            sub_path = sub_path[1:]
+        if not sub_path:
+            return col_data
+        return _get_nested_value(col_data, sub_path)
+    raise ValueError('Unknown field_path top column: {}'.format(top_col))
+
+
 # --- Scanning Operations ---
 
 def scan_userextendeddata():
@@ -234,9 +276,15 @@ def scan_userextendeddata():
     logger.info('Scanning UserExtendedData records...')
     rows = []
     for record in records.iterator():
+        if record.user is None:
+            raise ValueError('UserExtendedData id={} has no associated user'.format(record.id))
+        if not record.user._id:
+            raise ValueError('OSFUser id={} has no _id (guid)'.format(record.user.id))
+        # NULL/empty `data` is a VALID state (record with no extended data), not a
+        # defect: fall back to an empty dict so walk() simply yields nothing.
         data = record.data or {}
-        user_guid = record.user._id if record.user else 'N/A'
-        user_id = record.user.id if record.user else 'N/A'
+        user_guid = record.user._id
+        user_id = record.user.id
 
         for path, val in walk(data):
             row = make_error_row('osf_userextendeddata', record.id, user_guid, user_id, path, val)
@@ -253,12 +301,16 @@ def scan_osfuser():
     logger.info('Scanning OSFUser records...')
     rows = []
     for user in records.iterator():
+        if not user._id:
+            raise ValueError('OSFUser id={} has no _id (guid)'.format(user.id))
         user_id = user.id
-        user_guid = user._id or 'N/A'
+        user_guid = user._id
 
         # Simple string columns
         for col in OSFUSER_STRING_COLUMNS:
-            val = getattr(user, col, None)
+            if not hasattr(user, col):
+                raise ValueError('OSFUser model has no attribute "{}"'.format(col))
+            val = getattr(user, col)
             if not val:
                 continue
             row = make_error_row('osf_osfuser', user_id, user_guid, user_id, col, val)
@@ -267,7 +319,11 @@ def scan_osfuser():
 
         # JSON columns
         for col in OSFUSER_JSON_COLUMNS:
-            data = getattr(user, col, None)
+            if not hasattr(user, col):
+                raise ValueError('OSFUser model has no attribute "{}"'.format(col))
+            data = getattr(user, col)
+            # NULL/empty JSON column is a VALID state (user has no entries for this
+            # column), not a defect: nothing to scan, so skip without failing.
             if not data:
                 continue
             for path, val in walk(data):
@@ -330,7 +386,7 @@ def save_result_csv(result_rows, output_path):
 
 # --- Updating Operations ---
 
-def apply_fixes(rows, include_unrecoverable, dry_run=False):
+def apply_fixes(rows, dry_run=False, is_from_csv=False):
     """
     Apply fixes to database using Django ORM inside a transaction.
     If dry_run is True, the transaction will be rolled back at the end.
@@ -362,15 +418,10 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                 try:
                     record = UserExtendedData.objects.get(id=record_id)
                 except UserExtendedData.DoesNotExist:
-                    for row in ued_rows:
-                        result = dict(row)
-                        result['fix_applied'] = 'NO (RECORD NOT EXIST)'
-                        result['value_after'] = row['original_value']
-                        result_rows.append(result)
-                        skipped_count += 1
-                        logger.warning('   Skipped: [UserExtendedData] id={} path={} (record not exist)'.format(record_id, row['field_path']))
-                    continue
+                    raise ValueError('UserExtendedData id={} not found in DB'.format(record_id))
 
+                # NULL/empty `data` is a VALID state, not a defect; default to an
+                # empty dict so set_nested_value can build the path to update.
                 data = record.data or {}
                 has_changes = False
 
@@ -379,9 +430,25 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                     result['fix_applied'] = ''
                     result['value_after'] = ''
 
-                    should_fix = is_fixable(row, include_unrecoverable)
+                    if is_from_csv:
+                        should_fix = row.get('fix_applied', '').strip().upper() == 'YES'
+                    else:
+                        # Only auto-apply clean fixes. 'partial' results lost bytes
+                        # and must be reviewed by a human before applying.
+                        should_fix = row.get('fix_status') == 'clean'
 
                     if should_fix:
+                        if is_from_csv:
+                            current_val = _get_nested_value(data, row['field_path'])
+                            if current_val != row.get('original_value', ''):
+                                raise ValueError(
+                                    'UserExtendedData id={} field={}: original_value in CSV {!r} does not match '
+                                    'current DB value {!r}. DB may have been updated after export. '
+                                    'Please re-run the dry-run scan and review the CSV again.'.format(
+                                        record_id, row['field_path'],
+                                        row.get('original_value'), current_val,
+                                    )
+                                )
                         clean_fix = get_clean_fix(row['suggested_fix'])
                         data = set_nested_value(data, row['field_path'], clean_fix)
                         has_changes = True
@@ -390,7 +457,12 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                         fixable_count += 1
                         logger.info('   Fixed: [UserExtendedData] id={} path={}'.format(record_id, row['field_path']))
                     else:
-                        result['fix_applied'] = 'NO (UNRECOVERABLE)'
+                        if is_from_csv:
+                            result['fix_applied'] = row.get('fix_applied', '')
+                        elif row.get('fix_status') == 'partial':
+                            result['fix_applied'] = 'NO (PARTIAL - REVIEW NEEDED)'
+                        else:
+                            result['fix_applied'] = 'NO (UNRECOVERABLE)'
                         result['value_after'] = row['original_value']
                         skipped_count += 1
 
@@ -406,25 +478,35 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                 try:
                     user = OSFUser.objects.get(id=user_id)
                 except OSFUser.DoesNotExist:
-                    for row in u_rows:
-                        result = dict(row)
-                        result['fix_applied'] = 'NO (RECORD NOT EXIST)'
-                        result['value_after'] = row['original_value']
-                        result_rows.append(result)
-                        skipped_count += 1
-                        logger.warning('   Skipped: [OSFUser] id={} field={} (record not exist)'.format(user_id, row['field_path']))
-                    continue
+                    raise ValueError('OSFUser id={} not found in DB'.format(user_id))
 
                 has_changes = False
+                changed_json_cols = set()
 
                 for row in u_rows:
                     result = dict(row)
                     result['fix_applied'] = ''
                     result['value_after'] = ''
 
-                    should_fix = is_fixable(row, include_unrecoverable)
+                    if is_from_csv:
+                        should_fix = row.get('fix_applied', '').strip().upper() == 'YES'
+                    else:
+                        # Only auto-apply clean fixes. 'partial' results lost bytes
+                        # and must be reviewed by a human before applying.
+                        should_fix = row.get('fix_status') == 'clean'
 
                     if should_fix:
+                        if is_from_csv:
+                            current_val = _get_osfuser_field_value(user, row['field_path'])
+                            if current_val != row.get('original_value', ''):
+                                raise ValueError(
+                                    'OSFUser id={} field={}: original_value in CSV {!r} does not match '
+                                    'current DB value {!r}. DB may have been updated after export. '
+                                    'Please re-run the dry-run scan and review the CSV again.'.format(
+                                        user_id, row['field_path'],
+                                        row.get('original_value'), current_val,
+                                    )
+                                )
                         clean_fix = get_clean_fix(row['suggested_fix'])
                         path = row['field_path']
                         top_col = path.split('.')[0].split('[')[0]
@@ -437,6 +519,9 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                             fixable_count += 1
                             logger.info('   Fixed: [OSFUser] id={} field={}'.format(user_id, path))
                         elif top_col in OSFUSER_JSON_COLUMNS:
+                            # NULL/empty JSON column is a VALID state, not a defect;
+                            # default to an empty container so set_nested_value can
+                            # build the path to update.
                             col_data = getattr(user, top_col, None) or (dict() if top_col == 'social' else list())
                             sub_path = path[len(top_col):]
                             if sub_path.startswith('.'):
@@ -445,18 +530,40 @@ def apply_fixes(rows, include_unrecoverable, dry_run=False):
                             col_data = set_nested_value(col_data, sub_path, clean_fix)
                             setattr(user, top_col, copy.deepcopy(col_data))
                             has_changes = True
+                            changed_json_cols.add(top_col)
                             result['fix_applied'] = 'YES'
                             result['value_after'] = clean_fix
                             fixable_count += 1
                             logger.info('   Fixed: [OSFUser] id={} field={}'.format(user_id, path))
                     else:
-                        result['fix_applied'] = 'NO (UNRECOVERABLE)'
+                        if is_from_csv:
+                            result['fix_applied'] = row.get('fix_applied', '')
+                        elif row.get('fix_status') == 'partial':
+                            result['fix_applied'] = 'NO (PARTIAL - REVIEW NEEDED)'
+                        else:
+                            result['fix_applied'] = 'NO (UNRECOVERABLE)'
                         result['value_after'] = row['original_value']
                         skipped_count += 1
 
                     result_rows.append(result)
 
                 if has_changes:
+                    # Validate only the JSON column(s) we modified, using the field's own
+                    # validators (validate_social / validate_history_item). We deliberately
+                    # avoid full_clean() so unrelated/legacy fields cannot block a legitimate
+                    # fix and to skip uniqueness DB queries. Note: these validators check the
+                    # whole column value, so pre-existing invalid data in the same column will
+                    # also trigger a fail-fast here.
+                    for col in changed_json_cols:
+                        try:
+                            OSFUser._meta.get_field(col).run_validators(getattr(user, col))
+                        except DjangoValidationError as e:
+                            raise ValueError(
+                                'OSFUser id={} column "{}" failed model validation after applying the fix: {}. '
+                                'The column may contain other invalid/legacy data; please review the record.'.format(
+                                    user_id, col, '; '.join(e.messages),
+                                )
+                            )
                     user.save()
 
             if dry_run:
@@ -492,16 +599,21 @@ def main():
         default=default_output,
         help=f'Output CSV file path (default: {default_output})',
     )
-    parser.add_argument(
-        '--include-unrecoverable', action='store_true', default=False,
-        help='Also apply fixes for records marked as unrecoverable/partial result.',
-    )
     parser.add_argument(
         '--dry', action='store_true', default=False,
         help='Run script in dry-run mode (do not commit updates to the database).',
     )
+    parser.add_argument(
+        '--input-csv', '-i',
+        default=None,
+        help='Input CSV file generated in a dry run to apply specific fixes from.',
+    )
+
     args = parser.parse_args()
 
+    if not args.dry and not args.input_csv:
+        parser.error('the following arguments are required: --input-csv/-i (unless running in dry run mode with --dry)')
+
     if not args.dry:
         # For actual runs modifying data, set up file logging
         script_utils.add_file_logger(logger, __file__)
@@ -512,20 +624,195 @@ def main():
         format='%(asctime)s %(levelname)s %(message)s',
     )
 
-    # 1. Scan errors from database
+    # 1. Scan errors or load from CSV
     rows = []
-    rows.extend(scan_userextendeddata())
-    rows.extend(scan_osfuser())
+    is_from_csv = False
+    if args.input_csv:
+        logger.info('Loading records to fix from CSV file: {}'.format(args.input_csv))
+        if not os.path.exists(args.input_csv):
+            print('Error: Input CSV file "{}" does not exist.'.format(args.input_csv))
+            sys.exit(1)
+
+        try:
+            with open(args.input_csv, mode='r', encoding='utf-8-sig') as f:
+                # restkey/restval let us detect rows whose column count differs
+                # from the header: extra fields land in '__overflow__', missing
+                # trailing fields are filled with None.
+                reader = csv.DictReader(f, restkey='__overflow__', restval=None)
+
+                # Check headers: must match the exported schema exactly
+                # (correct column names, no missing column, no extra column).
+                if not reader.fieldnames:
+                    print('Error: Input CSV has no header row.')
+                    sys.exit(1)
+                expected_headers = set(RESULT_FIELDNAMES)
+                actual_headers = set(reader.fieldnames)
+                missing_cols = expected_headers - actual_headers
+                extra_cols = actual_headers - expected_headers
+                if missing_cols or extra_cols:
+                    msg = 'Error: Input CSV header does not match the expected schema ({} columns).'.format(len(RESULT_FIELDNAMES))
+                    if missing_cols:
+                        msg += ' Missing column(s): {}.'.format(', '.join(sorted(missing_cols)))
+                    if extra_cols:
+                        msg += ' Unexpected column(s): {}.'.format(', '.join(sorted(extra_cols)))
+                    print(msg)
+                    sys.exit(1)
+
+                try:
+                    for line_no, row in enumerate(reader, start=2):
+                        if not row:
+                            continue
+
+                        # Validate column count on this data row (no missing/extra fields)
+                        if row.get('__overflow__') is not None:
+                            print('Error: Row {} is invalid: it has more columns than the header.'.format(line_no))
+                            sys.exit(1)
+                        if any(value is None for key, value in row.items() if key != '__overflow__'):
+                            print('Error: Row {} is invalid: it has fewer columns than the header.'.format(line_no))
+                            sys.exit(1)
+
+                        # Validate source_table
+                        source_table = row.get('source_table')
+                        if not source_table:
+                            print('Error: Row {} is invalid: missing "source_table".'.format(line_no))
+                            sys.exit(1)
+                        source_table = source_table.strip()
+                        if source_table not in ('osf_userextendeddata', 'osf_osfuser'):
+                            print('Error: Row {} is invalid: "source_table" "{}" is not a recognised value.'.format(line_no, source_table))
+                            sys.exit(1)
+
+                        # Validate id
+                        id_val = row.get('id')
+                        if not id_val:
+                            print('Error: Row {} is invalid: missing "id".'.format(line_no))
+                            sys.exit(1)
+                        try:
+                            record_id = int(id_val.strip())
+                        except ValueError:
+                            print('Error: Row {} is invalid: "id" "{}" is not a valid integer.'.format(line_no, id_val))
+                            sys.exit(1)
+
+                        # Validate field_path
+                        field_path = row.get('field_path')
+                        if not field_path:
+                            print('Error: Row {} is invalid: missing "field_path".'.format(line_no))
+                            sys.exit(1)
+                        field_path = field_path.strip()
+
+                        # Validate required identity columns. These echo the exported
+                        # record and must not be blank (the customer asked for an
+                        # explicit empty-check on required fields such as user._id).
+                        user_guid = (row.get('user_guid') or '').strip()
+                        if not user_guid:
+                            print('Error: Row {} is invalid: missing "user_guid".'.format(line_no))
+                            sys.exit(1)
+                        user_id = (row.get('user_id') or '').strip()
+                        if not user_id:
+                            print('Error: Row {} is invalid: missing "user_id".'.format(line_no))
+                            sys.exit(1)
+
+                        # Validate suggested_fix if fix_applied is YES
+                        fix_applied = (row.get('fix_applied') or '').strip()
+                        suggested_fix = row.get('suggested_fix') or ''
+                        if fix_applied.upper() == 'YES' and not suggested_fix:
+                            print('Error: Row {} is invalid: "fix_applied" is "YES" but "suggested_fix" is empty.'.format(line_no))
+                            sys.exit(1)
+                        # Refuse to apply a value that still carries an UNRECOVERABLE marker,
+                        # otherwise the literal marker text would be written into the DB.
+                        if fix_applied.upper() == 'YES' and '[UNRECOVERABLE:' in suggested_fix:
+                            print(
+                                'Error: Row {} is invalid: "fix_applied" is "YES" but "suggested_fix" still '
+                                'contains an UNRECOVERABLE marker ({!r}). Set "fix_applied" to NO, or replace '
+                                '"suggested_fix" with the corrected value (remove the marker text).'.format(
+                                    line_no, suggested_fix,
+                                )
+                            )
+                            sys.exit(1)
+
+                        # Refuse to apply a value that still contains the Unicode
+                        # replacement character (U+FFFD). It signals a partial decode
+                        # that lost bytes; Django model constraints accept it on save(),
+                        # so we must fail fast here instead of writing corrupted data.
+                        if fix_applied.upper() == 'YES' and REPLACEMENT_CHAR in get_clean_fix(suggested_fix):
+                            print(
+                                'Error: Row {} is invalid: "fix_applied" is "YES" but "suggested_fix" still '
+                                'contains the Unicode replacement character (U+FFFD "{}"), which means the value '
+                                'was not fully recovered. Set "fix_applied" to NO, or replace "suggested_fix" '
+                                'with a fully corrected value.'.format(line_no, REPLACEMENT_CHAR)
+                            )
+                            sys.exit(1)
+
+                        # When this row will be applied, validate the target column and the
+                        # corrected value itself (column existence / length / residual issues).
+                        if fix_applied.upper() == 'YES':
+                            clean_fix = get_clean_fix(suggested_fix)
+                            top_col = field_path.split('.')[0].split('[')[0]
+
+                            if source_table == 'osf_osfuser':
+                                if top_col not in OSFUSER_STRING_COLUMNS and top_col not in OSFUSER_JSON_COLUMNS:
+                                    print('Error: Row {} is invalid: "field_path" top column "{}" is not a known OSFUser column.'.format(line_no, top_col))
+                                    sys.exit(1)
+                                # CharField columns are constrained by the DB (varchar(max_length)).
+                                # Check it early here so we fail fast with a clear message instead
+                                # of a low-level DataError mid-transaction.
+                                if top_col in OSFUSER_STRING_COLUMNS:
+                                    max_len = OSFUser._meta.get_field(top_col).max_length
+                                    if max_len is not None and len(clean_fix) > max_len:
+                                        print('Error: Row {} is invalid: "suggested_fix" length {} exceeds the max_length {} of column "{}".'.format(
+                                            line_no, len(clean_fix), max_len, top_col))
+                                        sys.exit(1)
+
+                            # Warn (do not stop) if the corrected value still appears to contain
+                            # encoding / HTML-entity issues. This is heuristic: accented Latin
+                            # letters can trigger a false positive, so a human should confirm.
+                            residual_issues = detect_issues(clean_fix)
+                            if residual_issues:
+                                logger.warning(
+                                    'Row %s: "suggested_fix" still appears to contain issue(s) [%s] after the fix: %r. '
+                                    'Please double-check this is a legitimate value before applying.',
+                                    line_no, ','.join(residual_issues), clean_fix,
+                                )
+
+                        rows.append({
+                            'source_table': source_table,
+                            'id': record_id,
+                            'user_guid': user_guid,
+                            'user_id': user_id,
+                            'field_path': field_path,
+                            'issues': (row.get('issues') or '').strip(),
+                            'original_value': row.get('original_value') or '',
+                            'suggested_fix': suggested_fix,
+                            'fix_applied': fix_applied,
+                            'fix_status': compute_fix_status(suggested_fix),
+                        })
+                except csv.Error as e:
+                    print('Error: Failed to parse CSV file "{}" due to formatting error: {}'.format(args.input_csv, e))
+                    sys.exit(1)
+        except Exception:
+            # Fail fast: log the full traceback and re-raise so the unexpected
+            # error is not swallowed. (sys.exit(1) above raises SystemExit, which
+            # is not an Exception subclass, so the explicit validation exits are
+            # unaffected by this handler.)
+            logger.exception('Unexpected error while opening/reading CSV file "%s".', args.input_csv)
+            raise
+
+        is_from_csv = True
+        logger.info('Loaded {} records from CSV.'.format(len(rows)))
+    else:
+        # Default: scan database
+        rows.extend(scan_userextendeddata())
+        rows.extend(scan_osfuser())
 
     if not rows:
         print('No encoding errors found.')
         return
 
-    # Print the table of found errors
-    print_errors(rows)
+    # Print the table of found/loaded errors
+    if rows:
+        print_errors(rows)
 
     # 2. Apply fixes (conditionally committing or rolling back)
-    result_rows = apply_fixes(rows, args.include_unrecoverable, dry_run=args.dry)
+    result_rows = apply_fixes(rows, dry_run=args.dry, is_from_csv=is_from_csv)
 
     # 3. Export results to CSV
     save_result_csv(result_rows, args.output)