diff --git a/.issues/106/issue.go b/.issues/106/issue.go new file mode 100644 index 0000000..bab2fc8 --- /dev/null +++ b/.issues/106/issue.go @@ -0,0 +1,61 @@ +/* + * Link: https://github.com/yeqown/go-qrcode/issues/106 + * Title: Feature: Add Kanji encoding mode support + * Author: fdelbos(https://github.com/fdelbos) + */ + +package main + +import ( + "fmt" + + yeqown "github.com/yeqown/go-qrcode/v2" + "github.com/yeqown/go-qrcode/writer/compressed" +) + +/* +See https://github.com/yeqown/go-qrcode/issues/106 +Feature: Add Kanji encoding mode support for QR codes + +Results: +Content length: 3 // source text length (3 Kanji characters) +qr-kanji.png: bytes +*/ +func main() { + // Pure Kanji text with explicit Kanji mode + // But Shift-JIS only supports Kanji characters, not full-width alphanumeric, + // so we can't encode `https://google.com` in Kanji mode + content := "日本語" + + fmt.Printf("Content length: %d\n", len([]rune(content))) + + qrc, err := yeqown.NewWith(content, + yeqown.WithEncodingMode(yeqown.EncModeKanji), + ) + if err != nil { + fmt.Printf("ERROR: %v\n", err) + return + } + + // Save to file + option := &compressed.Option{ + Padding: 4, + BlockSize: 1, + } + w, err := compressed.New("qr-kanji.png", option) + if err != nil { + panic(err) + } + defer w.Close() + + if err = qrc.Save(w); err != nil { + panic(err) + } + + fmt.Println("QR code saved to qr-kanji.png") + + // Note: If your input might contain non-Kanji characters, use EncModeAuto: + // qrc, err := yeqown.NewWith(anyText, + // yeqown.WithEncodingMode(yeqown.EncModeAuto), + // ) +} diff --git a/.issues/issue69/issue69.go b/.issues/69/issue.go similarity index 100% rename from .issues/issue69/issue69.go rename to .issues/69/issue.go diff --git a/README.md b/README.md index bacfd77..2ac3d6f 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,8 @@ const ( EncModeAlphanumeric // EncModeByte mode ... EncModeByte - // EncModeJP mode ... - EncModeJP + // EncModeKanji mode ... 
+ EncModeKanji ) // WithEncodingMode sets the encoding mode. diff --git a/chardet.go b/chardet.go new file mode 100644 index 0000000..8035b9f --- /dev/null +++ b/chardet.go @@ -0,0 +1,146 @@ +package qrcode + +import ( + "errors" + + "golang.org/x/text/encoding/japanese" + "golang.org/x/text/transform" +) + +var ( + ErrNotSupportCharacter = errors.New("character set not supported, please check your input data") +) + +// chardet.go refer to https://github.com/chardet/chardet to detect input string's +// character set, to see any unsupported character encountered in the input string. + +// analyzeEncFunc returns true is current byte matched in current mode, +// otherwise means you should use a bigger character set to check. +type analyzeEncFunc func(rune) bool + +// analyzeEncodeModeFromRaw try to detect letter set of input data, +// so that encoder can determine which mode should be use. +// reference: https://en.wikipedia.org/wiki/QR_code +// +// case1: only numbers, use EncModeNumeric. +// case2: could not use EncModeNumeric, but can find them all in character mapping, use EncModeAlphanumeric. +// case3: could not use EncModeAlphanumeric, but can find them all Shift JIS character set, use EncModeKanji. +// case4: could not use EncModeKanji, use EncModeByte. +func analyzeEncodeModeFromRaw(raw string) (encMode, error) { + var ( + analyzeFn analyzeEncFunc + mode = EncModeNone + ) + + getNextAnalyzeFn := func() analyzeEncFunc { + switch mode { + case EncModeNumeric: + return analyzeNum + case EncModeAlphanumeric: + return analyzeAlphaNum + case EncModeKanji: + return analyzeJP + case EncModeByte: + return analyzeByte + default: + } + + return nil + } + + next := func() bool { + // switch to next mode and get next analyze function. if no more analyze function, return true. 
+ mode <<= 1 + analyzeFn = getNextAnalyzeFn() + return analyzeFn == nil + } + + next() + + // Loop to check each character in raw data, + // from low mode to higher while current mode could bear the input data. + for _, r := range raw { + reAnalyze: + // issue#28 @borislavone reports this bug. + // FIXED(@yeqown): next encMode analyzeVersionAuto func did not check the previous byte, + // add goto statement to reanalyze previous byte which can't be analyzed in last encMode. + if pass := analyzeFn(r); pass { + continue + } + + if nomore := next(); nomore { + break + } + + goto reAnalyze + } + + if mode > EncModeByte { + // If the mode overflow the EncModeKanji, means we can't encode the input data. + return EncModeNone, ErrNotSupportCharacter + } + + return mode, nil +} + +// analyzeNum is r in num encMode +func analyzeNum(r rune) bool { + return r >= '0' && r <= '9' +} + +// analyzeAlphaNum is r in alpha number +func analyzeAlphaNum(r rune) bool { + if (r >= '0' && r <= '9') || (r >= 'A' && r <= 'Z') { + return true + } + switch r { + case ' ', '$', '%', '*', '+', '-', '.', '/', ':': + return true + } + return false +} + +// analyzeByte always return true, since byte (utf8) mode can encode all characters. +func analyzeByte(r rune) bool { + return true +} + +// analyzeJP checks if a character can be encoded in QR Code Kanji mode. +// A character is valid for Kanji mode if: +// 1. It is in the CJK Unified Ideographs block (U+4E00-U+9FFF) +// 2. It can be converted to Shift JIS +// 3. 
The resulting Shift JIS value is in the valid QR Code ranges: +// - 0x8140-0x9FFC (first range) +// - 0xE040-0xEBBF (second range) +func analyzeJP(r rune) bool { + // Check if the character is in the CJK Unified Ideographs block + // This is a quick pre-check to avoid unnecessary conversion attempts + // U+4E00-U+9FFF: CJK Unified Ideographs + // U+3400-U+4DBF: CJK Unified Ideographs Extension A + // U+F900-U+FAFF: CJK Compatibility Ideographs + isCJK := (r >= 0x4E00 && r <= 0x9FFF) || + (r >= 0x3400 && r <= 0x4DBF) || + (r >= 0xF900 && r <= 0xFAFF) + + if !isCJK { + return false + } + + // Try to convert the character to Shift JIS + // If conversion fails, it's not a valid Kanji character for QR Code + enc := japanese.ShiftJIS.NewEncoder() + s2, _, err := transform.String(enc, string(r)) + if err != nil || len(s2) != 2 { + return false + } + + // Check if the resulting Shift JIS value is in the valid QR Code Kanji ranges + data := []byte(s2) + hi := uint16(data[0]) + lo := uint16(data[1]) + code := hi<<8 | lo + + // QR Code Kanji mode supports Shift JIS ranges: + // 0x8140-0x9FFC and 0xE040-0xEBBF + return (code >= 0x8140 && code <= 0x9FFC) || (code >= 0xE040 && code <= 0xEBBF) +} diff --git a/chardet_test.go b/chardet_test.go new file mode 100644 index 0000000..7774c07 --- /dev/null +++ b/chardet_test.go @@ -0,0 +1,399 @@ +package qrcode + +import ( + "testing" +) + +func Test_analyzeNum(t *testing.T) { + type args struct { + byt rune + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "case 0", + args: args{byt: '0'}, + want: true, + }, + { + name: "case 1", + args: args{byt: 'a'}, + want: false, + }, + { + name: "case 2", + args: args{byt: 'A'}, + want: false, + }, + { + name: "case 3", + args: args{byt: '9'}, + want: true, + }, + { + name: "case 4", + args: args{byt: '*'}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := analyzeNum(tt.args.byt); got != tt.want { + 
t.Errorf("analyzeNum() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_analyzeAlphanum(t *testing.T) { + type args struct { + byt rune + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "case 0", + args: args{byt: '0'}, + want: true, + }, + { + name: "case 1", + args: args{byt: 'a'}, + want: false, + }, + { + name: "case 2", + args: args{byt: 'A'}, + want: true, + }, + { + name: "case 3", + args: args{byt: '9'}, + want: true, + }, + { + name: "case 4", + args: args{byt: '*'}, + want: true, + }, + { + name: "case 5", + args: args{byt: '?'}, + want: false, + }, + { + name: "case 6", + args: args{byt: '&'}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := analyzeAlphaNum(tt.args.byt); got != tt.want { + t.Errorf("analyzeAlphaNum() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_analyzeByte(t *testing.T) { + type args struct { + byt rune + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "case 0", + args: args{byt: '0'}, + want: true, + }, + { + name: "case 1", + args: args{byt: 'a'}, + want: true, + }, + { + name: "case 2", + args: args{byt: 'A'}, + want: true, + }, + { + name: "case 3", + args: args{byt: '9'}, + want: true, + }, + { + name: "case 4", + args: args{byt: '*'}, + want: true, + }, + { + name: "case 5", + args: args{byt: '?'}, + want: true, + }, + { + name: "case 6", + args: args{byt: '&'}, + want: true, + }, + { + name: "case 7", + args: args{byt: 'Ö'}, + want: true, + }, + { + name: "case 8", + args: args{byt: 'に'}, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := analyzeByte(tt.args.byt); got != tt.want { + t.Errorf("analyzeByte() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_analyzeJP(t *testing.T) { + type args struct { + r rune + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "case 0", + args: args{r: '0'}, + want: 
false, + }, + { + name: "case 1", + args: args{r: 'a'}, + want: false, + }, + { + name: "case 2", + args: args{r: 'A'}, + want: false, + }, + { + name: "case 3", + args: args{r: '9'}, + want: false, + }, + { + name: "case 4", + args: args{r: '*'}, + want: false, + }, + { + name: "case 5", + args: args{r: '?'}, + want: false, + }, + { + name: "case 6", + args: args{r: '&'}, + want: false, + }, + { + name: "case 7", + args: args{r: 'Ö'}, + want: false, + }, + { + name: "case 8", + args: args{r: 'に'}, + want: false, // Hiragana is NOT supported in Kanji mode + }, + { + name: "case 9", + args: args{r: '茗'}, + want: true, + }, + { + name: "case 10", + args: args{r: '杆'}, + want: true, + }, + { + name: "case 11", + args: args{r: '荷'}, + want: true, + }, + { + name: "case 12", + args: args{r: '杠'}, + want: true, + }, + { + name: "case 13", + args: args{r: '杙'}, + want: true, + }, + { + name: "case 14", + args: args{r: '杣'}, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := analyzeJP(tt.args.r); got != tt.want { + t.Errorf("analyzeJP(%c=0x%x) = %v, want %v", tt.args.r, tt.args.r, got, tt.want) + } + }) + } +} + +func Test_analyzeMode(t *testing.T) { + type args struct { + raw string + } + tests := []struct { + name string + args args + want encMode + wantErr bool + }{ + { + name: "case 0", + args: args{raw: "123120899231"}, + want: EncModeNumeric, + wantErr: false, + }, + { + name: "case 1", + args: args{raw: ":/1231H208*99231FBJO"}, + want: EncModeAlphanumeric, + wantErr: false, + }, + { + name: "case 2", + args: args{raw: "hahah1298312hG&^FBJO@jhgG*"}, + want: EncModeByte, + }, + { + name: "case 3", + args: args{raw: "JKAHDOIANKQOIHCMJKASJ"}, + want: EncModeAlphanumeric, + wantErr: false, + }, + { + name: "case 4", + args: args{raw: "https://baidu.com?keyword=_JSO==GA"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "case 5", + args: args{raw: "茗荷"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: 
"case 6 (swedish letter)", + args: args{raw: "Övrigt aksldjlk Övrigt should JP encMode?"}, + want: EncModeByte, + }, + { + name: "case 7 (japanese letter)", + args: args{raw: "嵋嶄"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: "issue#28 alphanum mode does not support lower case letter", + args: args{raw: "a"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "hiragana only - should use Byte mode", + args: args{raw: "これはひらがなです"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "katakana only - should use Byte mode", + args: args{raw: "コンニチハ"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "mixed kanji and hiragana - should use Byte mode", + args: args{raw: "漢字ひらがな"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "single kanji character", + args: args{raw: "漢"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: "kanji sentence", + args: args{raw: "日本語"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: "kanji world characters", + args: args{raw: "世界"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: "long kanji text", + args: args{raw: "東京京都大阪北海道沖縄鹿児島"}, + want: EncModeKanji, + wantErr: false, + }, + { + name: "mixed kanji and alphanumeric - should use Byte mode", + args: args{raw: "漢字ABC123"}, + want: EncModeByte, + wantErr: false, + }, + { + name: "CJK Extension A character - not in Shift JIS, should use Byte mode", + args: args{raw: "㐀"}, // U+3400, CJK Extension A - not in Shift JIS + want: EncModeByte, + wantErr: false, + }, + { + name: "CJK Compatibility Ideograph - not in Shift JIS, should use Byte mode", + args: args{raw: "豈"}, // U+F900, CJK Compatibility Ideographs - not in Shift JIS + want: EncModeByte, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := analyzeEncodeModeFromRaw(tt.args.raw) + if (err != nil) != tt.wantErr { + t.Errorf("analyzeMode() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if got != tt.want { + 
t.Errorf("analyzeMode() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/cmd/wasm/types.go b/cmd/wasm/types.go index f81aae3..7c9783a 100644 --- a/cmd/wasm/types.go +++ b/cmd/wasm/types.go @@ -89,8 +89,8 @@ func (o *genOption) encodeOptions() []qrcode.EncodeOption { out = append(out, qrcode.WithEncodingMode(qrcode.EncModeNumeric)) case uint8(qrcode.EncModeByte): out = append(out, qrcode.WithEncodingMode(qrcode.EncModeByte)) - case uint8(qrcode.EncModeJP): - out = append(out, qrcode.WithEncodingMode(qrcode.EncModeJP)) + case uint8(qrcode.EncModeKanji): + out = append(out, qrcode.WithEncodingMode(qrcode.EncModeKanji)) } switch o.encodeOption.ecLevel { diff --git a/docs/kanji-encoding.md b/docs/kanji-encoding.md new file mode 100644 index 0000000..80757c4 --- /dev/null +++ b/docs/kanji-encoding.md @@ -0,0 +1,309 @@ +# Kanji Encoding Mode + +## Overview + +Kanji mode is a specialized encoding mode in QR Code designed to efficiently encode Japanese Kanji characters. It provides significant space savings compared to byte mode by leveraging the structure of Shift JIS (Japanese Industrial Standards) character encoding. + +### Benefits + +- **Compact Encoding**: Each Kanji character is encoded in 13 bits (vs. 
16 bits in UTF-16 or 3 bytes per character in UTF-8) +- **Efficient Storage**: Reduces QR Code size for Japanese text by approximately 50% compared to byte mode +- **Optimized for Japanese**: Specifically designed for the Japanese writing system + +## Specifications + +| Parameter | Value | +|-----------|-------| +| Mode Indicator | `1000` (4 bits) | +| Bits per Character | 13 bits | +| Character Encoding | Shift JIS (JIS X 0208) | + +### Character Count Indicator Bits + +The number of bits used to store the character count varies by QR Code version: + +| QR Code Version | Character Count Bits | +|-----------------|---------------------| +| 1-9 | 8 bits | +| 10-26 | 10 bits | +| 27-40 | 12 bits | + +## Character Set + +Kanji mode supports characters from the JIS X 0208 character set, encoded using Shift JIS. The valid Kanji characters fall within two ranges: + +### Shift JIS Ranges + +| Range | Start | End | Description | +|-------|-------|-----|-------------| +| Range 1 | 0x8140 | 0x9FFC | First Kanji block | +| Range 2 | 0xE040 | 0xEBBF | Second Kanji block | + +### Unicode to Shift JIS Mapping + +Modern applications typically work with Unicode characters. To use Kanji mode, Unicode characters must first be converted to their Shift JIS byte representation. + +Example mappings: +- Unicode `U+4E16` (世) → Shift JIS `0x90 0xA2` +- Unicode `U+754C` (界) → Shift JIS `0x8A 0x45` + +## Encoding Algorithm + +### Step-by-Step Process + +1. **Input**: Unicode Kanji character(s) +2. **Convert**: Transform each Unicode character to its Shift JIS 2-byte representation +3. **Adjust**: Apply the adjustment formula based on the Shift JIS value +4. 
**Encode**: Compress to 13-bit representation + +### Mathematical Formula + +Given a Shift JIS code `code` (2 bytes): + +``` +// Step 1: Adjust the base +if (code >= 0x8140 && code <= 0x9FFC) { + adjusted = code - 0x8140 +} else if (code >= 0xE040 && code <= 0xEBBF) { + adjusted = code - 0xC140 +} + +// Step 2: Split into high and low bytes +high = adjusted >> 8 // Upper byte +low = adjusted & 0xFF // Lower byte + +// Step 3: Calculate encoded value (13-bit result) +encoded = (high × 0xC0) + low +``` + +### Why 0xC0? + +The multiplier `0xC0` (192 in decimal) is derived from the Shift JIS encoding structure: +- In the valid ranges, the lower byte can be `0x40-0xFC` (except `0x7F`) +- This gives 188 possible values per high byte +- The encoding packs these efficiently: `high × 192 + low` results in at most 13 bits + +### Encoding Example + +Let's encode the Kanji character "世" (Unicode `U+4E16`): + +1. **Convert to Shift JIS**: `0x90A2` +2. **Check range**: `0x90A2` is in range 1 (0x8140-0x9FFC) +3. **Adjust**: `0x90A2 - 0x8140 = 0x0F62` +4. **Split**: `high = 0x0F`, `low = 0x62` +5. **Encode**: `(0x0F × 0xC0) + 0x62 = 0x0BA2` +6. **Binary**: `0101110100010` (13 bits) + +## Character Detection + +To determine if a character is eligible for Kanji mode encoding: + +### Detection Algorithm + +1. **Check if character is Kanji**: The character must be in the Japanese Kanji Unicode ranges (primarily U+4E00-U+9FFF for CJK Unified Ideographs) +2. **Convert to Shift JIS**: Attempt conversion from Unicode to Shift JIS +3. **Validate range**: The resulting Shift JIS value must be in: + - `0x8140` to `0x9FFC`, OR + - `0xE040` to `0xEBBF` +4. 
**Check byte length**: Each character must encode to exactly 2 bytes in Shift JIS + +### Detection Criteria Summary + +``` +IsKanji(character) { + shiftJIS = UnicodeToShiftJIS(character) + + if (shiftJIS.length != 2) { + return false + } + + code = (shiftJIS[0] << 8) | shiftJIS[1] + + return (code >= 0x8140 && code <= 0x9FFC) || + (code >= 0xE040 && code <= 0xEBBF) +} +``` + +### Automatic Mode Selection + +When encoding mixed content, use Kanji mode only when: +- ALL characters in the data are valid Kanji characters +- Each character successfully converts to a valid Shift JIS value in the allowed ranges +- The content is primarily Japanese text + +If any character fails validation, fall back to a compatible mode (typically byte mode with UTF-8). + +## Practical Considerations + +### When to Use Kanji Mode + +- Japanese text containing primarily Kanji characters +- When minimizing QR Code size is critical +- When target scanners support Kanji mode decoding + +### Limitations + +- Only supports JIS X 0208 characters +- Hiragana and Katakana are NOT supported (use byte mode) +- Some rare Kanji characters outside the ranges cannot be encoded +- Requires proper Shift JIS conversion capability + +### Compatibility + +Most modern QR Code scanners support Kanji mode, but for maximum compatibility with older scanners, consider using byte mode with UTF-8 encoding, especially for international applications. 
+ +--- + +# Kanji 编码模式 + +## 概述 + +Kanji 模式是 QR 码中专门设计的一种编码模式,用于高效编码日文汉字字符。通过利用 Shift JIS(日本工业标准)字符编码的结构,它相比字节模式能显著节省空间。 + +### 优势 + +- **紧凑编码**: 每个汉字字符编码为 13 位(相比 UTF-16 的 16 位或 UTF-8 的每字符 3 字节) +- **高效存储**: 相比字节模式,可将日文文本的 QR 码大小减少约 50% +- **专为日文优化**: 专门针对日文字符系统设计 + +## 规范说明 + +| 参数 | 值 | +|------|-----| +| 模式指示器 | `1000` (4 位) | +| 每字符位数 | 13 位 | +| 字符编码 | Shift JIS (JIS X 0208) | + +### 字符计数指示器位数 + +用于存储字符计数的位数随 QR 码版本变化: + +| QR 码版本 | 字符计数位数 | +|-----------|-------------| +| 1-9 | 8 位 | +| 10-26 | 10 位 | +| 27-40 | 12 位 | + +## 字符集 + +Kanji 模式支持 JIS X 0208 字符集中的字符,使用 Shift JIS 编码。有效的汉字字符落在两个范围内: + +### Shift JIS 范围 + +| 范围 | 起始 | 结束 | 描述 | +|------|------|------|------| +| 范围 1 | 0x8140 | 0x9FFC | 第一汉字块 | +| 范围 2 | 0xE040 | 0xEBBF | 第二汉字块 | + +### Unicode 到 Shift JIS 映射 + +现代应用通常使用 Unicode 字符。要使用 Kanji 模式,Unicode 字符必须首先转换为其 Shift JIS 双字节表示。 + +映射示例: +- Unicode `U+4E16` (世) → Shift JIS `0x90 0xA2` +- Unicode `U+754C` (界) → Shift JIS `0x8A 0x45` + +## 编码算法 + +### 逐步流程 + +1. **输入**: Unicode 汉字字符 +2. **转换**: 将每个 Unicode 字符转换为其 Shift JIS 双字节表示 +3. **调整**: 根据 Shift JIS 值应用调整公式 +4. **编码**: 压缩为 13 位表示 + +### 数学公式 + +给定 Shift JIS 代码 `code`(2 字节): + +``` +// 步骤 1: 调整基数 +if (code >= 0x8140 && code <= 0x9FFC) { + adjusted = code - 0x8140 +} else if (code >= 0xE040 && code <= 0xEBBF) { + adjusted = code - 0xC140 +} + +// 步骤 2: 拆分为高位和低位字节 +high = adjusted >> 8 // 高位字节 +low = adjusted & 0xFF // 低位字节 + +// 步骤 3: 计算编码值(13 位结果) +encoded = (high × 0xC0) + low +``` + +### 为什么是 0xC0? + +乘数 `0xC0`(十进制 192)源自 Shift JIS 编码结构: +- 在有效范围内,低位字节可以是 `0x40-0xFC`(除 `0x7F` 外) +- 这为每个高位字节提供 188 个可能值 +- 编码将其高效打包:`high × 192 + low` 结果最多为 13 位 + +### 编码示例 + +让我们编码汉字 "世"(Unicode `U+4E16`): + +1. **转换为 Shift JIS**: `0x90A2` +2. **检查范围**: `0x90A2` 在范围 1 内 (0x8140-0x9FFC) +3. **调整**: `0x90A2 - 0x8140 = 0x0F62` +4. **拆分**: `high = 0x0F`, `low = 0x62` +5. **编码**: `(0x0F × 0xC0) + 0x62 = 0x0BA2` +6. **二进制**: `0101110100010` (13 位) + +## 字符检测 + +判断字符是否符合 Kanji 模式编码条件: + +### 检测算法 + +1. 
**检查是否为汉字**: 字符必须在日文汉字 Unicode 范围内(主要是 U+4E00-U+9FFF 的 CJK 统一表意文字) +2. **转换为 Shift JIS**: 尝试从 Unicode 转换为 Shift JIS +3. **验证范围**: 结果 Shift JIS 值必须在: + - `0x8140` 到 `0x9FFC`,或 + - `0xE040` 到 `0xEBBF` +4. **检查字节长度**: 每个字符在 Shift JIS 中必须编码为恰好 2 字节 + +### 检测标准总结 + +``` +IsKanji(character) { + shiftJIS = UnicodeToShiftJIS(character) + + if (shiftJIS.length != 2) { + return false + } + + code = (shiftJIS[0] << 8) | shiftJIS[1] + + return (code >= 0x8140 && code <= 0x9FFC) || + (code >= 0xE040 && code <= 0xEBBF) +} +``` + +### 自动模式选择 + +编码混合内容时,仅当满足以下条件时使用 Kanji 模式: +- 数据中的所有字符都是有效的汉字字符 +- 每个字符都能成功转换为允许范围内的有效 Shift JIS 值 +- 内容主要是日文文本 + +如果任何字符验证失败,则回退到兼容模式(通常为 UTF-8 字节模式)。 + +## 实际考虑 + +### 何时使用 Kanji 模式 + +- 主要包含汉字字符的日文文本 +- 最小化 QR 码大小至关重要时 +- 目标扫描器支持 Kanji 模式解码 + +### 限制 + +- 仅支持 JIS X 0208 字符 +- 不支持平假名和片假名(使用字节模式) +- 范围外的某些罕见汉字无法编码 +- 需要正确的 Shift JIS 转换能力 + +### 兼容性 + +大多数现代 QR 码扫描器支持 Kanji 模式,但为了与旧扫描器实现最大兼容性,对于国际应用可考虑使用带 UTF-8 编码的字节模式。 diff --git a/encoder.go b/encoder.go index 73803ba..9292b7c 100644 --- a/encoder.go +++ b/encoder.go @@ -5,26 +5,53 @@ package qrcode import ( "fmt" "log" + "strconv" + "unicode/utf8" "github.com/yeqown/reedsolomon/binary" + "golang.org/x/text/encoding/japanese" + "golang.org/x/text/transform" ) -// encMode ... +// encMode indicates the encoding mode of the data to be encoded. +// The encoding mode is used to determine how the data should be encoded +// into bits for the QR code. This repository supports the following encoding +// modes: +// - EncModeNone: no encoding +// - EncModeNumeric: numeric encoding +// - EncModeAlphanumeric: alphanumeric encoding +// - EncModeKanji: japanese kanji encoding +// - EncModeByte: byte encoding +// +// The encoding mode is determined by the data to be encoded. For example, if +// the data to be encoded is all numeric, the encoding mode will be EncModeNumeric. +// If the data to be encoded is alphanumeric, the encoding mode will be EncModeAlphanumeric. 
+// You can also specify the encoding mode automatically by using EncModeAuto, which +// will automatically determine the encoding mode based on the data to be encoded. type encMode uint const ( - // a qrbool of EncModeAuto will trigger a detection of the letter set from the input data, + // EncModeAuto will trigger a detection of the letter set from the input data. EncModeAuto = 0 - // EncModeNone mode ... - EncModeNone encMode = 1 << iota - // EncModeNumeric mode ... - EncModeNumeric - // EncModeAlphanumeric mode ... - EncModeAlphanumeric - // EncModeByte mode ... - EncModeByte + + // EncModeNone mode represents no encoding, usually used as initial value of encMode + EncModeNone encMode = 2 + + // EncModeNumeric mode support only numeric character set (0-9) + EncModeNumeric encMode = 4 + + // EncModeAlphanumeric mode support only alphanumeric character set (0-9, A-Z, SP, $%*+-./ or :) + EncModeAlphanumeric encMode = 8 + // EncModeJP mode ... - EncModeJP + // @Deprecated use EncModeKanji instead + EncModeJP encMode = 16 + // EncModeKanji mode support only Shift JIS encoding character set. + // From 0x8140 to 0x9FFC and 0xE040 to 0xEBBF. + EncModeKanji = EncModeJP + + // EncModeByte mode support ISO-8859-1 character set by default, but also support UTF-8. 
+ EncModeByte encMode = 32 ) var ( @@ -41,12 +68,12 @@ func getEncModeName(mode encMode) string { return "numeric" case EncModeAlphanumeric: return "alphanumeric" + case EncModeKanji: + return "kanji" case EncModeByte: return "byte" - case EncModeJP: - return "japan" default: - return "unknown" + return "unknown(" + strconv.Itoa(int(mode)) + ")" } } @@ -59,7 +86,7 @@ func getEncodeModeIndicator(mode encMode) *binary.Binary { return binary.New(false, false, true, false) case EncModeByte: return binary.New(false, true, false, false) - case EncModeJP: + case EncModeKanji: return binary.New(true, false, false, false) default: panic("no indicator") @@ -69,8 +96,7 @@ func getEncodeModeIndicator(mode encMode) *binary.Binary { // encoder ... data to bit stream ... type encoder struct { // self init - dst *binary.Binary - data []byte // raw input data + dst *binary.Binary // initial params mode encMode // encode mode @@ -81,9 +107,14 @@ type encoder struct { } func newEncoder(m encMode, ec ecLevel, v version) *encoder { + switch m { + case EncModeNumeric, EncModeAlphanumeric, EncModeByte, EncModeKanji: + default: + panic("unsupported data encoding mode in newEncoder()") + } + return &encoder{ dst: nil, - data: nil, mode: m, ecLv: ec, version: v, @@ -93,26 +124,43 @@ func newEncoder(m encMode, ec ecLevel, v version) *encoder { // Encode ... // 1. encode raw data into bitset // 2. 
append _defaultPadding data -func (e *encoder) Encode(byts []byte) (*binary.Binary, error) { +func (e *encoder) Encode(raw string) (*binary.Binary, error) { e.dst = binary.New() - e.data = byts + + var ( + data []byte + charCount = 0 // Character count for the character count indicator + ) + switch e.mode { + case EncModeNumeric, EncModeAlphanumeric, EncModeByte: + data = []byte(raw) + charCount = len(data) + case EncModeKanji: + data = toShiftJIS(raw) + // For Kanji mode, charCount is the number of Kanji characters, not bytes + charCount = utf8.RuneCountInString(raw) + default: + log.Printf("unsupported encoding mode: %s", getEncModeName(e.mode)) + } // append mode indicator symbol indicator := getEncodeModeIndicator(e.mode) e.dst.Append(indicator) // append chars length counter bits symbol - e.dst.AppendUint32(uint32(len(byts)), e.charCountBits()) + e.dst.AppendUint32(uint32(charCount), e.charCountBits()) // encode data with specified mode switch e.mode { case EncModeNumeric: - e.encodeNumeric() + e.encodeNumeric(data) case EncModeAlphanumeric: - e.encodeAlphanumeric() + e.encodeAlphanumeric(data) + case EncModeKanji: + e.encodeKanji(data) case EncModeByte: - e.encodeByte() - case EncModeJP: - panic("this has not been finished") + e.encodeByte(data) + default: + log.Printf("unsupported encoding mode: %s", getEncModeName(e.mode)) } // fill and _defaultPadding bits @@ -124,20 +172,20 @@ func (e *encoder) Encode(byts []byte) (*binary.Binary, error) { } // 0001b mode indicator -func (e *encoder) encodeNumeric() { +func (e *encoder) encodeNumeric(data []byte) { if e.dst == nil { log.Println("e.dst is nil") return } - for i := 0; i < len(e.data); i += 3 { - charsRemaining := len(e.data) - i + for i := 0; i < len(data); i += 3 { + charsRemaining := len(data) - i var value uint32 bitsUsed := 1 for j := 0; j < charsRemaining && j < 3; j++ { value *= 10 - value += uint32(e.data[i+j] - 0x30) + value += uint32(data[i+j] - 0x30) bitsUsed += 3 } e.dst.AppendUint32(value, 
bitsUsed) @@ -145,18 +193,18 @@ func (e *encoder) encodeNumeric() { } // 0010b mode indicator -func (e *encoder) encodeAlphanumeric() { +func (e *encoder) encodeAlphanumeric(data []byte) { if e.dst == nil { log.Println("e.dst is nil") return } - for i := 0; i < len(e.data); i += 2 { - charsRemaining := len(e.data) - i + for i := 0; i < len(data); i += 2 { + charsRemaining := len(data) - i var value uint32 for j := 0; j < charsRemaining && j < 2; j++ { value *= 45 - value += encodeAlphanumericCharacter(e.data[i+j]) + value += encodeAlphanumericCharacter(data[i+j]) } bitsUsed := 6 @@ -169,16 +217,92 @@ func (e *encoder) encodeAlphanumeric() { } // 0100b mode indicator -func (e *encoder) encodeByte() { +func (e *encoder) encodeByte(data []byte) { if e.dst == nil { log.Println("e.dst is nil") return } - for _, b := range e.data { + for _, b := range data { _ = e.dst.AppendByte(b, 8) } } +// toShiftJIS converts Unicode string to Shift JIS and applies Kanji encoding. +// Each character is encoded as 13 bits using the QR Code Kanji mode algorithm. 
+// Reference: https://www.thonky.com/qr-code-tutorial/kanji-mode-encoding +func toShiftJIS(raw string) []byte { + enc := japanese.ShiftJIS.NewEncoder() + s2, _, err := transform.String(enc, raw) + if err != nil { + log.Printf("could not encode string to Shift JIS: %v", err) + return []byte{} + } + + data := []byte(s2) + if len(data)%2 != 0 { + // Kanji characters must encode to exactly 2 bytes in Shift JIS + log.Printf("shift JIS encoded data must be a multiple of 2, but got %d", len(data)) + return []byte{} + } + + for i := 0; i < len(data); i += 2 { + hi, lo := encodeShiftJIS(data[i], data[i+1]) + if hi == 0 && lo == 0 { + // Invalid character encountered + log.Printf("invalid Kanji character at position %d", i/2) + return []byte{} + } + data[i], data[i+1] = hi, lo + } + + return data +} + +func encodeShiftJIS(hi byte, lo byte) (byte, byte) { + r := uint16(hi)<<8 | uint16(lo) + + // QR Code Kanji mode supports Shift JIS ranges: + // 0x8140-0x9FFC and 0xE040-0xEBBF + if r >= 0x8140 && r <= 0x9FFC { + r -= 0x8140 + } else if r >= 0xE040 && r <= 0xEBBF { + r -= 0xC140 + } else { + // Not a valid QR Code Kanji character + log.Printf("'%c'(0x%x) not a valid QR Code Kanji character (must be in 0x8140-0x9FFC or 0xE040-0xEBBF)", r, r) + return 0, 0 + } + + hi = uint8(r >> 8) + lo = uint8(r & 0xFF) + + // Compress to 13-bit value: (high × 0xC0) + low + r = uint16(hi)*uint16(0xC0) + uint16(lo) + + return byte(r >> 8), byte(r & 0xFF) +} + +// encodeKanji encodes Kanji data (already processed by encodeShiftJIS). +// Each Kanji character is encoded as 13 bits: the data contains pairs of bytes +// where data[i] contains the high 5 bits and data[i+1] contains the low 8 bits. 
+func (e *encoder) encodeKanji(data []byte) { + // data must be a multiple of 2, since toShiftJIS encodes 1 char to 2 bytes + if len(data)%2 != 0 { + log.Println("data must be a multiple of 2") + return + } + + for i := 0; i < len(data); i += 2 { + // Reconstruct the 13-bit value: (high 5 bits << 8) | low 8 bits + // data[i] contains the high 5 bits of the 13-bit result + // data[i+1] contains the low 8 bits of the 13-bit result + value := uint32(data[i])<<8 | uint32(data[i+1]) + + // Append the 13-bit value to the bitstream + e.dst.AppendUint32(value, 13) + } +} + // Break Up into 8-bit Codewords and Add Pad Bytes if Necessary func (e *encoder) breakUpInto8bit() error { // fill ending code (max 4bit) @@ -221,15 +345,15 @@ var charCountMap = map[string]int{ "9_numeric": 10, "9_alphanumeric": 9, "9_byte": 8, - "9_japan": 8, + "9_kanji": 8, "26_numeric": 12, "26_alphanumeric": 11, "26_byte": 16, - "26_japan": 10, + "26_kanji": 10, "40_numeric": 14, "40_alphanumeric": 13, "40_byte": 16, - "40_japan": 12, + "40_kanji": 12, } // charCountBits @@ -282,70 +406,3 @@ func encodeAlphanumericCharacter(v byte) uint32 { return 0 } - -// analyzeEncFunc returns true is current byte matched in current mode, -// otherwise means you should use a bigger character set to check. -type analyzeEncFunc func(byte) bool - -// analyzeEncodeModeFromRaw try to detect letter set of input data, -// so that encoder can determine which mode should be use. -// reference: https://en.wikipedia.org/wiki/QR_code -// -// case1: only numbers, use EncModeNumeric. -// case2: could not use EncModeNumeric, but you can find all of them in character mapping, use EncModeAlphanumeric. -// case3: could not use EncModeAlphanumeric, but you can find all of them in ISO-8859-1 character set, use EncModeByte. -// case4: could not use EncModeByte, use EncModeJP, no more choice. 
-func analyzeEncodeModeFromRaw(raw []byte) encMode { - analyzeFnMapping := map[encMode]analyzeEncFunc{ - EncModeNumeric: analyzeNum, - EncModeAlphanumeric: analyzeAlphaNum, - EncModeByte: nil, - EncModeJP: nil, - } - - var ( - f analyzeEncFunc - mode = EncModeNumeric - ) - - // loop to check each character in raw data, - // from low mode to higher while current mode could bearing the input data. - for _, byt := range raw { - reAnalyze: - if f = analyzeFnMapping[mode]; f == nil { - break - } - - // issue#28 @borislavone reports this bug. - // FIXED(@yeqown): next encMode analyzeVersionAuto func did not check the previous byte, - // add goto statement to reanalyze previous byte which can't be analyzed in last encMode. - if !f(byt) { - mode <<= 1 - goto reAnalyze - } - } - - return mode -} - -// analyzeNum is byt in num encMode -func analyzeNum(byt byte) bool { - return byt >= '0' && byt <= '9' -} - -// analyzeAlphaNum is byt in alpha number -func analyzeAlphaNum(byt byte) bool { - if (byt >= '0' && byt <= '9') || (byt >= 'A' && byt <= 'Z') { - return true - } - switch byt { - case ' ', '$', '%', '*', '+', '-', '.', '/', ':': - return true - } - return false -} - -//// analyzeByte is byt in bytes. 
-//func analyzeByte(byt byte) qrbool { -// return false -//} diff --git a/encoder_test.go b/encoder_test.go index c89a459..0def202 100644 --- a/encoder_test.go +++ b/encoder_test.go @@ -1,13 +1,10 @@ package qrcode import ( + "bytes" "testing" ) -// func init() { -// load(defaultVersionCfg) -// } - func TestEncodeNum(t *testing.T) { enc := encoder{ ecLv: ErrorCorrectionLow, @@ -15,7 +12,7 @@ func TestEncodeNum(t *testing.T) { version: loadVersion(1, ErrorCorrectionLow), } - b, err := enc.Encode([]byte("12312312")) + b, err := enc.Encode("12312312") if err != nil { t.Errorf("could not encode: %v", err) t.Fail() @@ -30,7 +27,7 @@ func TestEncodeAlphanum(t *testing.T) { version: loadVersion(1, ErrorCorrectionLow), } - b, err := enc.Encode([]byte("AKJA*:/")) + b, err := enc.Encode("AKJA*:/") if err != nil { t.Errorf("could not encode: %v", err) t.Fail() @@ -45,7 +42,7 @@ func TestEncodeByte(t *testing.T) { version: loadVersion(5, ErrorCorrectionQuart), } - b, err := enc.Encode([]byte("http://baidu.com?keyword=123123")) + b, err := enc.Encode("http://baidu.com?keyword=123123") if err != nil { t.Errorf("could not encode: %v", err) t.Fail() @@ -53,159 +50,251 @@ func TestEncodeByte(t *testing.T) { t.Log(b, b.Len()) } -func Test_analyzeNum(t *testing.T) { +func Test_toShiftJIS(t *testing.T) { type args struct { - byt byte + s string } tests := []struct { name string args args - want bool + want []byte }{ { - name: "case 0", - args: args{byt: '0'}, - want: true, + name: "test 茗荷", + args: args{"茗荷"}, + want: []byte{0x1A, 0xAA, 0x06, 0x97}, }, { - name: "case 1", - args: args{byt: 'a'}, - want: false, + name: "test 世", + args: args{"世"}, + // Shift JIS: 0x90A2 + // 0x90A2 - 0x8140 = 0x0F62, hi=0x0F, lo=0x62 + // encoded = 0x0F*0xC0 + 0x62 = 0xBA2 + // high byte = 0x0B, low byte = 0xA2 + want: []byte{0x0B, 0xA2}, }, { - name: "case 2", - args: args{byt: 'A'}, - want: false, + name: "test 世界", + args: args{"世界"}, + // "世": [0x0B, 0xA2], "界": [0x06, 0xC5] + want: []byte{0x0B, 
0xA2, 0x06, 0xC5}, }, { - name: "case 3", - args: args{byt: '9'}, - want: true, + name: "test 日本語", + args: args{"日本語"}, + // "日": [0x0E, 0x3A], "本": [0x0F, 0xFB], "語": [0x08, 0xEA] + want: []byte{0x0E, 0x3A, 0x0F, 0xFB, 0x08, 0xEA}, }, { - name: "case 4", - args: args{byt: '*'}, - want: false, + name: "test 漢字", + args: args{"漢字"}, + // "漢": [0x07, 0x3F], "字": [0x0A, 0x1A] + want: []byte{0x07, 0x3F, 0x0A, 0x1A}, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := analyzeNum(tt.args.byt); got != tt.want { - t.Errorf("analyzeNum() = %v, want %v", got, tt.want) + if got := toShiftJIS(tt.args.s); !bytes.Equal(got, tt.want) { + t.Errorf("toShiftJIS(%q) = %v, want %v", tt.args.s, got, tt.want) } }) } } -func Test_analyzeAlphanum(t *testing.T) { +func Test_encodeShiftJIS(t *testing.T) { type args struct { - byt byte + hi byte + lo byte } tests := []struct { - name string - args args - want bool + name string + args args + wantHi byte + wantLo byte }{ + // Range 1: 0x8140-0x9FFC + { + name: "lower boundary of range 1", + args: args{0x81, 0x40}, + wantHi: 0x00, + wantLo: 0x00, + }, + { + name: "middle of range 1 (世)", + args: args{0x90, 0xA2}, // "世" in Shift JIS + // 0x90A2 - 0x8140 = 0x0F62 + // high=0x0F, low=0x62 + // encoded = 0x0F*0xC0 + 0x62 = 0xBA2 + wantHi: 0x0B, + wantLo: 0xA2, + }, { - name: "case 0", - args: args{byt: '0'}, - want: true, + name: "upper boundary of range 1", + args: args{0x9F, 0xFC}, + // 0x9FFC - 0x8140 = 0x1EBC + // high=0x1E, low=0xBC + // encoded = 0x1E*0xC0 + 0xBC = 0x173C + wantHi: 0x17, + wantLo: 0x3C, }, + // Range 2: 0xE040-0xEBBF { - name: "case 1", - args: args{byt: 'a'}, - want: false, + name: "lower boundary of range 2", + args: args{0xE0, 0x40}, + // 0xE040 - 0xC140 = 0x1F00 + // high=0x1F, low=0x00 + // encoded = 0x1F*0xC0 + 0x00 = 0x1740 + wantHi: 0x17, + wantLo: 0x40, }, { - name: "case 2", - args: args{byt: 'A'}, - want: true, + name: "middle of range 2", + args: args{0xE4, 0xAA}, + // 0xE4AA - 
0xC140 = 0x236A + // high=0x23, low=0x6A + // encoded = 0x23*0xC0 + 0x6A = 0x1AAA + wantHi: 0x1A, + wantLo: 0xAA, }, { - name: "case 3", - args: args{byt: '9'}, - want: true, + name: "upper boundary of range 2", + args: args{0xEB, 0xBF}, + // 0xEBBF - 0xC140 = 0x2A7F + // high=0x2A, low=0x7F + // encoded = 0x2A*0xC0 + 0x7F = 0x1FFF + wantHi: 0x1F, + wantLo: 0xFF, }, + // Invalid ranges { - name: "case 4", - args: args{byt: '*'}, - want: true, + name: "below range 1", + args: args{0x80, 0x00}, + wantHi: 0x00, + wantLo: 0x00, }, { - name: "case 5", - args: args{byt: '?'}, - want: false, + name: "between ranges", + args: args{0x9F, 0xFD}, + wantHi: 0x00, + wantLo: 0x00, }, { - name: "case 6", - args: args{byt: '&'}, - want: false, + name: "above range 2", + args: args{0xEC, 0x00}, + wantHi: 0x00, + wantLo: 0x00, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := analyzeAlphaNum(tt.args.byt); got != tt.want { - t.Errorf("analyzeAlphaNum() = %v, want %v", got, tt.want) + gotHi, gotLo := encodeShiftJIS(tt.args.hi, tt.args.lo) + if gotHi != tt.wantHi || gotLo != tt.wantLo { + t.Errorf("encodeShiftJIS(0x%02X, 0x%02X) = (0x%02X, 0x%02X), want (0x%02X, 0x%02X)", + tt.args.hi, tt.args.lo, gotHi, gotLo, tt.wantHi, tt.wantLo) } }) } } -func Test_anlayzeMode(t *testing.T) { - type args struct { - raw []byte - } +func TestEncodeKanji(t *testing.T) { tests := []struct { - name string - args args - want encMode + name string + input string + wantLen int // Expected bit length for encoded data (13 bits per character) }{ { - name: "case 0", - args: args{raw: []byte("123120899231")}, - want: EncModeNumeric, - }, - { - name: "case 1", - args: args{raw: []byte(":/1231H208*99231FBJO")}, - want: EncModeAlphanumeric, + name: "single character 世", + input: "世", + wantLen: 13, }, { - name: "case 2", - args: args{raw: []byte("hahah1298312hG&^FBJO@jhgG*")}, - want: EncModeByte, + name: "two characters 世界", + input: "世界", + wantLen: 26, }, { - name: "case 3", - 
args: args{raw: []byte("JKAHDOIANKQOIHCMJKASJ")}, - want: EncModeAlphanumeric, - }, - { - name: "case 4", - args: args{raw: []byte("https://baidu.com?keyword=_JSO==GA")}, - want: EncModeByte, + name: "four characters 日本語", + input: "日本語", + wantLen: 39, }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + enc := encoder{ + ecLv: ErrorCorrectionLow, + mode: EncModeKanji, + version: loadVersion(1, ErrorCorrectionLow), + } + + b, err := enc.Encode(tt.input) + if err != nil { + t.Errorf("could not encode: %v", err) + t.Fail() + } + + // The total length includes mode indicator (4), char count (8 for v1-9), + // data bits (wantLen), and padding to fill the codeword capacity + t.Logf("Encode(%q) total bits: %d", tt.input, b.Len()) + + // Check that we successfully encoded the Kanji data + // The mode indicator is 1000 (4 bits), char count is 8 bits for version 1 + // So the first 12 bits should be: 1000 (mode) + char count (8 bits) + // For a single character, char count = 1 = 00000001 + // First 12 bits: 1000 00000001 + }) + } +} + +func TestEncodeKanji_Version(t *testing.T) { + tests := []struct { + name string + input string + version int + expectedCharCountBits int + }{ { - name: "case 5", - args: args{raw: []byte("这是汉字也应该是EncModeByte")}, - want: EncModeByte, + name: "version 1", + input: "漢字", + version: 1, + expectedCharCountBits: 8, }, { - name: "case 6 (swedish letter)", - args: args{raw: []byte("Övrigt aksldjlk Övrigt should JP encMode?")}, - want: EncModeByte, + name: "version 10", + input: "漢字", + version: 10, + expectedCharCountBits: 10, }, { - name: "issue#28", - args: args{raw: []byte("a")}, - want: EncModeByte, + name: "version 27", + input: "漢字", + version: 27, + expectedCharCountBits: 12, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := analyzeEncodeModeFromRaw(tt.args.raw); got != tt.want { - t.Errorf("analyzeEncodeModeFromRaw() = %v, want %v", got, tt.want) + enc := encoder{ + ecLv: 
ErrorCorrectionLow, + mode: EncModeKanji, + version: loadVersion(tt.version, ErrorCorrectionLow), } + + charCountBits := enc.charCountBits() + if charCountBits != tt.expectedCharCountBits { + t.Errorf("charCountBits() = %d, want %d", charCountBits, tt.expectedCharCountBits) + } + + b, err := enc.Encode(tt.input) + if err != nil { + t.Errorf("could not encode: %v", err) + t.Fail() + } + + t.Logf("Encode(%q) with version %d = %v, total bits: %d", tt.input, tt.version, b, b.Len()) }) } } diff --git a/go.mod b/go.mod index 12f16e9..1d65435 100644 --- a/go.mod +++ b/go.mod @@ -10,5 +10,6 @@ require ( require ( github.com/davecgh/go-spew v1.1.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/text v0.22.0 // indirect gopkg.in/yaml.v3 v3.0.0 // indirect ) diff --git a/go.sum b/go.sum index ff1802e..ae1592c 100644 --- a/go.sum +++ b/go.sum @@ -7,6 +7,8 @@ github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5Cc github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/yeqown/reedsolomon v1.0.0 h1:x1h/Ej/uJnNu8jaX7GLHBWmZKCAWjEJTetkqaabr4B0= github.com/yeqown/reedsolomon v1.0.0/go.mod h1:P76zpcn2TCuL0ul1Fso373qHRc69LKwAw/Iy6g1WiiM= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/go.work.sum b/go.work.sum index 9bb2472..2c37ecc 100644 --- a/go.work.sum +++ b/go.work.sum @@ -52,20 +52,21 @@ golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ= golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8= 
golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.6.0 h1:L4ZwwTvKW9gr0ZMS1yrHD9GZhIuVjOBBnaKH+SPQK0Q= golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= golang.org/x/term v0.5.0 h1:n2a8QNdAb0sZNpU9R1ALUXBbY+w51fCQDN+7EdxNBsY= golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4= golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7 h1:9zdDQZ7Thm29KFXgAX/+yaf3eVbP7djjWp/dXAppNCc= gopkg.in/yaml.v2 v2.2.3 h1:fvjTMHxHEw/mxHbtzPi3JCcKXQRAnQTBRo6YCJSVHKI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/mask_test.go b/mask_test.go index 220bef1..691a775 100644 --- a/mask_test.go +++ b/mask_test.go @@ -8,7 +8,7 @@ import ( func TestMask(t *testing.T) { qrc := &QRCode{ - sourceRawBytes: []byte("baidu.com google.com qq.com 
sina.com apple.com"), + sourceText: "baidu.com google.com qq.com sina.com apple.com", encodingOption: DefaultEncodingOption(), } err := qrc.init() diff --git a/qrcode.go b/qrcode.go index ee9b7a5..0852331 100644 --- a/qrcode.go +++ b/qrcode.go @@ -38,9 +38,38 @@ func toBytes[T ~string | ~[]byte](v T) []byte { } } +// validateEncodingMode checks if the specified encoding mode is compatible with the input text. +// Returns an error if the text contains characters that cannot be encoded in the specified mode. +func validateEncodingMode(mode encMode, text string) error { + var analyzeFn analyzeEncFunc + + switch mode { + case EncModeNumeric: + analyzeFn = analyzeNum + case EncModeAlphanumeric: + analyzeFn = analyzeAlphaNum + case EncModeKanji: + analyzeFn = analyzeJP + case EncModeByte: + // Byte mode can encode any character + return nil + default: + return nil + } + + for _, r := range text { + if !analyzeFn(r) { + return fmt.Errorf("character '%c' (U+%04X) cannot be encoded in %s mode", + r, r, getEncModeName(mode)) + } + } + + return nil +} + func build(raw []byte, option *encodingOption) (*QRCode, error) { qrc := &QRCode{ - sourceRawBytes: raw, + sourceText: string(raw), dataBSet: nil, mat: nil, ecBSet: nil, @@ -61,7 +90,7 @@ func build(raw []byte, option *encodingOption) (*QRCode, error) { // QRCode contains fields to generate QRCode matrix, outputImageOptions to Draw image, // etc. 
type QRCode struct { - sourceRawBytes []byte // raw Data to transfer + sourceText string // sourceText input text dataBSet *binary.Binary // final data bit stream of encode data mat *Matrix // matrix grid to store final bitmap @@ -79,7 +108,7 @@ func (q *QRCode) Save(w Writer) error { defer func() { if err := w.Close(); err != nil { - log.Printf("[WARNNING] [go-qrcode] close writer failed: %v\n", err) + log.Printf("[WARNING] [go-qrcode] close writer failed: %v\n", err) } }() @@ -98,7 +127,15 @@ func (q *QRCode) Dimension() int { func (q *QRCode) init() (err error) { // choose encode mode (num, alpha num, byte, Japanese) if q.encodingOption.EncMode == EncModeAuto { - q.encodingOption.EncMode = analyzeEncodeModeFromRaw(q.sourceRawBytes) + q.encodingOption.EncMode, err = analyzeEncodeModeFromRaw(q.sourceText) + if err != nil { + return fmt.Errorf("init: analyze encode mode failed: %v", err) + } + } else { + // Validate that the specified encoding mode is compatible with the input + if err = validateEncodingMode(q.encodingOption.EncMode, q.sourceText); err != nil { + return err + } } // choose version @@ -149,7 +186,7 @@ func (q *QRCode) calcVersion() (ver *version, err error) { // automatically parse version if needAnalyze { // analyzeVersion the input data to choose to adapt version - analyzed, err2 := analyzeVersion(q.sourceRawBytes, opt.EcLevel, opt.EncMode) + analyzed, err2 := analyzeVersion(q.sourceText, opt.EcLevel, opt.EncMode) if err2 != nil { err = fmt.Errorf("calcVersion: analyzeVersionAuto failed: %v", err2) return nil, err @@ -180,7 +217,7 @@ func (q *QRCode) dataEncoding() (blocks []dataBlock, err error) { var ( bset *binary.Binary ) - bset, err = q.encoder.Encode(q.sourceRawBytes) + bset, err = q.encoder.Encode(q.sourceText) if err != nil { err = fmt.Errorf("could not encode data: %v", err) return diff --git a/qrcode_test.go b/qrcode_test.go index 9feb555..c7aa8bd 100644 --- a/qrcode_test.go +++ b/qrcode_test.go @@ -23,17 +23,12 @@ func Test_NewWith(t 
*testing.T) { qrc.mat.print() } -// Test_NewWithConfig_UnmatchedEncodeMode NewWith will panic while encMode is -// not matched to Config.EncMode, for example: -// cfg.EncMode is EncModeAlphanumeric but source text is bytes encoding. +// Test_NewWithConfig_UnmatchedEncodeMode tests that explicit encoding mode +// returns error when input contains characters that cannot be encoded. func Test_NewWithConfig_UnmatchedEncodeMode(t *testing.T) { - assert.Panics(t, func() { - _, err := NewWith("abcs", WithEncodingMode(EncModeAlphanumeric)) - if err != nil { - t.Errorf("could not generate QRCode: %v", err) - t.Fail() - } - }) + // Lowercase letters with Alphanumeric mode should return error + _, err := NewWith("abcs", WithEncodingMode(EncModeAlphanumeric)) + assert.Error(t, err, "expected error when using lowercase letters with Alphanumeric mode") } func Benchmark_NewQRCode_1KB(b *testing.B) { @@ -148,3 +143,168 @@ func Test_NewWith_MinimumVersion_WithExplicitVersion(t *testing.T) { // WithVersion takes precedence, so version should be 10 assert.Equal(t, 10, qrc.v.Ver) } + +// Test_NewWith_Kanji_EncMode tests Kanji mode encoding with explicit mode setting +func Test_NewWith_Kanji_EncMode(t *testing.T) { + tests := []struct { + name string + text string + wantErr bool + expectedErr string + }{ + // Valid Kanji input + { + name: "single Kanji character", + text: "漢", + wantErr: false, + }, + { + name: "multiple Kanji characters", + text: "漢字", + wantErr: false, + }, + { + name: "Kanji sentence", + text: "日本語", + wantErr: false, + }, + { + name: "Kanji mixed characters", + text: "世界", + wantErr: false, + }, + // Invalid input for Kanji mode + { + name: "ASCII characters", + text: "https://google.com", + wantErr: true, + expectedErr: "cannot be encoded in kanji mode", + }, + { + name: "numbers with Kanji mode", + text: "漢字123", + wantErr: true, + expectedErr: "cannot be encoded in kanji mode", + }, + { + name: "Hiragana with Kanji mode", + text: "こんにちは", + wantErr: true, + 
expectedErr: "cannot be encoded in kanji mode", + }, + { + name: "Katakana with Kanji mode", + text: "コンニチハ", + wantErr: true, + expectedErr: "cannot be encoded in kanji mode", + }, + { + name: "mixed Kanji and ASCII", + text: "漢字test", + wantErr: true, + expectedErr: "cannot be encoded in kanji mode", + }, + { + name: "CJK Extension A character", + text: "㐀", + wantErr: true, + expectedErr: "cannot be encoded in kanji mode", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + qrc, err := NewWith(tt.text, + WithEncodingMode(EncModeKanji), + WithErrorCorrectionLevel(ErrorCorrectionLow), + ) + + if tt.wantErr { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.expectedErr) + return + } + + require.NoError(t, err) + assert.NotNil(t, qrc) + assert.Equal(t, EncModeKanji, qrc.encoder.mode) + t.Logf("Kanji QR code for '%s': version=%d", tt.text, qrc.v.Ver) + }) + } +} + +// Test_NewWith_Kanji_AutoMode tests automatic Kanji mode detection +func Test_NewWith_Kanji_AutoMode(t *testing.T) { + tests := []struct { + name string + text string + expected encMode + }{ + { + name: "Kanji only - auto detect", + text: "漢字", + expected: EncModeKanji, + }, + { + name: "Kanji characters - auto detect", + text: "世界", + expected: EncModeKanji, + }, + { + name: "Mixed ASCII and Katakana - auto detect Byte mode", + text: "QRコード123", + expected: EncModeByte, + }, + { + name: "Pure Kanji text - auto detect", + text: "金木水火土日月星", + expected: EncModeKanji, + }, + { + name: "Long Kanji text - auto detect", + text: "東京京都大阪北海道沖縄鹿児島", + expected: EncModeKanji, + }, + { + name: "Hiragana only - auto detect Byte mode", + text: "これはひらがなです", + expected: EncModeByte, // Hiragana is not supported in Kanji mode + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Use EncModeAuto to let the library detect the mode automatically + qrc, err := NewWith(tt.text, + WithEncodingMode(EncModeAuto), + WithErrorCorrectionLevel(ErrorCorrectionLow), + 
) + require.NoError(t, err) + assert.NotNil(t, qrc) + + // Verify the detected mode matches expectation + assert.Equal(t, tt.expected, qrc.encoder.mode, + "Expected mode %v for text '%s', got %v", tt.expected, tt.text, qrc.encoder.mode) + + t.Logf("Auto-detected mode for '%s': %v, version=%d", tt.text, getEncModeName(qrc.encoder.mode), qrc.v.Ver) + }) + } +} + +// Test_NewWith_Kanji_Version10 tests Kanji encoding with specific version +func Test_NewWith_Kanji_Version10(t *testing.T) { + qrc, err := NewWith("漢字文字試験", + WithEncodingMode(EncModeKanji), + WithVersion(10), + WithErrorCorrectionLevel(ErrorCorrectionLow), + ) + require.NoError(t, err) + assert.NotNil(t, qrc) + + // Verify version is set correctly + assert.Equal(t, 10, qrc.v.Ver) + // Verify encoding mode is Kanji + assert.Equal(t, EncModeKanji, qrc.encoder.mode) + + t.Logf("Kanji QR code with version 10: matrix dimension=%d", qrc.mat.Width()) +} diff --git a/version.go b/version.go index 641c65b..cad7d2a 100644 --- a/version.go +++ b/version.go @@ -7,6 +7,8 @@ import ( "sync" // "github.com/skip2/go-qrcode/bitset" + "unicode/utf8" + "github.com/yeqown/reedsolomon/binary" ) @@ -273,7 +275,7 @@ func loadVersion(lv int, ec ecLevel) version { // // check out http://muyuchengfeng.xyz/%E4%BA%8C%E7%BB%B4%E7%A0%81-%E5%AD%97%E7%AC%A6%E5%AE%B9%E9%87%8F%E8%A1%A8/ // for more details. 
-func analyzeVersion(raw []byte, ec ecLevel, mode encMode) (*version, error) { +func analyzeVersion(raw string, ec ecLevel, mode encMode) (*version, error) { step := 0 switch ec { case ErrorCorrectionLow: @@ -288,7 +290,16 @@ func analyzeVersion(raw []byte, ec ecLevel, mode encMode) (*version, error) { return nil, errInvalidErrorCorrectionLevel } - want, mark := len(raw), 0 + // Byte mode capacity is measured in bytes, not characters + // Numeric, Alphanumeric, and Kanji modes are character-based + var want int + if mode == EncModeByte { + want = len(raw) + } else { + want = utf8.RuneCountInString(raw) + } + + mark := 0 for ; step < 160; step += 4 { switch mode { @@ -298,7 +309,7 @@ func analyzeVersion(raw []byte, ec ecLevel, mode encMode) (*version, error) { mark = versions[step].Cap.AlphaNumeric case EncModeByte: mark = versions[step].Cap.Byte - case EncModeJP: + case EncModeKanji: mark = versions[step].Cap.JP default: return nil, errMissMatchedEncodeType diff --git a/version_test.go b/version_test.go index 10555f1..9af578f 100644 --- a/version_test.go +++ b/version_test.go @@ -70,7 +70,7 @@ func Test_analyzeVersion(t *testing.T) { v3 := loadVersion(23, ErrorCorrectionMedium) type args struct { - raw []byte + raw string ecLv ecLevel eMode encMode } @@ -83,7 +83,7 @@ func Test_analyzeVersion(t *testing.T) { { name: "case 0", args: args{ - raw: []byte("TEXT"), + raw: "TEXT", ecLv: ErrorCorrectionMedium, eMode: EncModeAlphanumeric, }, @@ -93,7 +93,7 @@ func Test_analyzeVersion(t *testing.T) { { name: "case 1", args: args{ - raw: []byte(strings.Repeat("TEXT", 30)), + raw: strings.Repeat("TEXT", 30), ecLv: ErrorCorrectionMedium, eMode: EncModeAlphanumeric, }, @@ -103,7 +103,7 @@ func Test_analyzeVersion(t *testing.T) { { name: "case 2", args: args{ - raw: []byte(strings.Repeat("TEXT", 300)), + raw: strings.Repeat("TEXT", 300), ecLv: ErrorCorrectionMedium, eMode: EncModeAlphanumeric, }, @@ -282,7 +282,7 @@ func Benchmark_loadVersion_bottom(b *testing.B) { } func 
Benchmark_analyzeVersion_short(b *testing.B) { - source := []byte("text") + source := "text" for i := 0; i < b.N; i++ { _, _ = analyzeVersion(source, ErrorCorrectionMedium, EncModeByte) @@ -290,7 +290,7 @@ func Benchmark_analyzeVersion_short(b *testing.B) { } func Benchmark_analyzeVersion_middle(b *testing.B) { - source := []byte(strings.Repeat("text", 30)) + source := strings.Repeat("text", 30) for i := 0; i < b.N; i++ { _, _ = analyzeVersion(source, ErrorCorrectionMedium, EncModeByte) @@ -298,7 +298,7 @@ func Benchmark_analyzeVersion_middle(b *testing.B) { } func Benchmark_analyzeVersion_long(b *testing.B) { - source := []byte(strings.Repeat("text", 300)) + source := strings.Repeat("text", 300) for i := 0; i < b.N; i++ { _, _ = analyzeVersion(source, ErrorCorrectionMedium, EncModeByte)