mirror of
https://github.com/golang/go.git
synced 2025-05-29 19:35:42 +00:00
first cut at case mapping tables and library.
next cut will do the optimization for alternating sequences. R=rsc DELTA=1658 (1620 added, 9 deleted, 29 changed) OCL=34072 CL=34075
This commit is contained in:
parent
30dcb13420
commit
22c2b476a8
@ -103,12 +103,12 @@ var testLetter = []int {
|
|||||||
func TestDigit(t *testing.T) {
|
func TestDigit(t *testing.T) {
|
||||||
for i, r := range testDigit {
|
for i, r := range testDigit {
|
||||||
if !IsDigit(r) {
|
if !IsDigit(r) {
|
||||||
t.Errorf("IsDigit(%#x) = false, want true\n", r);
|
t.Errorf("IsDigit(U+%04X) = false, want true\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, r := range testLetter {
|
for i, r := range testLetter {
|
||||||
if IsDigit(r) {
|
if IsDigit(r) {
|
||||||
t.Errorf("IsDigit(%#x) = true, want false\n", r);
|
t.Errorf("IsDigit(U+%04X) = true, want false\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,11 +9,39 @@ package unicode
|
|||||||
// The representation of a range of Unicode code points. The range runs from Lo to Hi
|
// The representation of a range of Unicode code points. The range runs from Lo to Hi
|
||||||
// inclusive and has the specified stride.
|
// inclusive and has the specified stride.
|
||||||
type Range struct {
|
type Range struct {
|
||||||
Lo int;
|
Lo int;
|
||||||
Hi int;
|
Hi int;
|
||||||
Stride int;
|
Stride int;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The representation of a range of Unicode code points for case conversion.
|
||||||
|
// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
|
||||||
|
// are the number to add to the code point to reach the code point for a
|
||||||
|
// different case for that character. They may be negative. If zero, it
|
||||||
|
// means the character is in the corresponding case.
|
||||||
|
type CaseRange struct {
|
||||||
|
Lo int;
|
||||||
|
Hi int;
|
||||||
|
Delta d;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indices into the Delta arrays inside CaseRanges for case mapping.
|
||||||
|
const (
|
||||||
|
UpperCase = iota;
|
||||||
|
LowerCase;
|
||||||
|
TitleCase;
|
||||||
|
MaxCase;
|
||||||
|
)
|
||||||
|
type d [MaxCase]int32 // to make the CaseRanges text shorter
|
||||||
|
|
||||||
|
// If the Delta field of a CaseRange is UpperLower or LowerUpper, it means
|
||||||
|
// this CaseRange represents a sequence of the form (say)
|
||||||
|
// Upper Lower Upper Lower.
|
||||||
|
const (
|
||||||
|
UpperLower = 1;
|
||||||
|
LowerUpper = -1;
|
||||||
|
)
|
||||||
|
|
||||||
// Is tests whether rune is in the specified table of ranges.
|
// Is tests whether rune is in the specified table of ranges.
|
||||||
func Is(ranges []Range, rune int) bool {
|
func Is(ranges []Range, rune int) bool {
|
||||||
// common case: rune is ASCII or Latin-1
|
// common case: rune is ASCII or Latin-1
|
||||||
@ -80,3 +108,59 @@ func IsLetter(rune int) bool {
|
|||||||
}
|
}
|
||||||
return Is(Letter, rune);
|
return Is(Letter, rune);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// To maps the rune to the specified case, UpperCase, LowerCase, or TitleCase
|
||||||
|
func To(_case int, rune int) int {
|
||||||
|
if _case < 0 || MaxCase <= _case {
|
||||||
|
return 0xFFFD // as reasonable an error as any
|
||||||
|
}
|
||||||
|
// binary search over ranges
|
||||||
|
lo := 0;
|
||||||
|
hi := len(CaseRanges);
|
||||||
|
for lo < hi {
|
||||||
|
m := lo + (hi - lo)/2;
|
||||||
|
r := CaseRanges[m];
|
||||||
|
if r.Lo <= rune && rune <= r.Hi {
|
||||||
|
return rune + int(r.Delta[_case]);
|
||||||
|
}
|
||||||
|
if rune < r.Lo {
|
||||||
|
hi = m;
|
||||||
|
} else {
|
||||||
|
lo = m+1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rune;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToUpper maps the rune to upper case
|
||||||
|
func ToUpper(rune int) int {
|
||||||
|
if rune < 0x80 { // quick ASCII check
|
||||||
|
if 'a' <= rune && rune <= 'z' {
|
||||||
|
rune &^= ' '
|
||||||
|
}
|
||||||
|
return rune
|
||||||
|
}
|
||||||
|
return To(UpperCase, rune);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToLower maps the rune to lower case
|
||||||
|
func ToLower(rune int) int {
|
||||||
|
if rune < 0x80 { // quick ASCII check
|
||||||
|
if 'A' <= rune && rune <= 'Z' {
|
||||||
|
rune |= ' '
|
||||||
|
}
|
||||||
|
return rune
|
||||||
|
}
|
||||||
|
return To(LowerCase, rune);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToTitle maps the rune to title case
|
||||||
|
func ToTitle(rune int) int {
|
||||||
|
if rune < 0x80 { // quick ASCII check
|
||||||
|
if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII
|
||||||
|
rune &^= ' '
|
||||||
|
}
|
||||||
|
return rune
|
||||||
|
}
|
||||||
|
return To(TitleCase, rune);
|
||||||
|
}
|
||||||
|
@ -89,20 +89,127 @@ var notletterTest = []int{
|
|||||||
0x10ffff,
|
0x10ffff,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type caseT struct {
|
||||||
|
cas, in, out int
|
||||||
|
}
|
||||||
|
|
||||||
|
var caseTest = []caseT {
|
||||||
|
// errors
|
||||||
|
caseT{-1, '\n', 0xFFFD},
|
||||||
|
caseT{UpperCase, -1, -1},
|
||||||
|
caseT{UpperCase, 1<<30, 1<<30},
|
||||||
|
|
||||||
|
// ASCII (special-cased so test carefully)
|
||||||
|
caseT{UpperCase, '\n', '\n'},
|
||||||
|
caseT{UpperCase, 'a', 'A'},
|
||||||
|
caseT{UpperCase, 'A', 'A'},
|
||||||
|
caseT{UpperCase, '7', '7'},
|
||||||
|
caseT{LowerCase, '\n', '\n'},
|
||||||
|
caseT{LowerCase, 'a', 'a'},
|
||||||
|
caseT{LowerCase, 'A', 'a'},
|
||||||
|
caseT{LowerCase, '7', '7'},
|
||||||
|
caseT{TitleCase, '\n', '\n'},
|
||||||
|
caseT{TitleCase, 'a', 'A'},
|
||||||
|
caseT{TitleCase, 'A', 'A'},
|
||||||
|
caseT{TitleCase, '7', '7'},
|
||||||
|
|
||||||
|
// Latin-1: easy to read the tests!
|
||||||
|
caseT{UpperCase, 0x80, 0x80},
|
||||||
|
caseT{UpperCase, 'Å', 'Å'},
|
||||||
|
caseT{UpperCase, 'å', 'Å'},
|
||||||
|
caseT{LowerCase, 0x80, 0x80},
|
||||||
|
caseT{LowerCase, 'Å', 'å'},
|
||||||
|
caseT{LowerCase, 'å', 'å'},
|
||||||
|
caseT{TitleCase, 0x80, 0x80},
|
||||||
|
caseT{TitleCase, 'Å', 'Å'},
|
||||||
|
caseT{TitleCase, 'å', 'Å'},
|
||||||
|
|
||||||
|
// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
|
||||||
|
caseT{UpperCase, 0x0131, 'I'},
|
||||||
|
caseT{LowerCase, 0x0131, 0x0131},
|
||||||
|
caseT{TitleCase, 0x0131, 'I'},
|
||||||
|
|
||||||
|
// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
|
||||||
|
caseT{UpperCase, 0x0133, 0x0132},
|
||||||
|
caseT{LowerCase, 0x0133, 0x0133},
|
||||||
|
caseT{TitleCase, 0x0133, 0x0132},
|
||||||
|
|
||||||
|
// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
|
||||||
|
caseT{UpperCase, 0x212A, 0x212A},
|
||||||
|
caseT{LowerCase, 0x212A, 'k'},
|
||||||
|
caseT{TitleCase, 0x212A, 0x212A},
|
||||||
|
|
||||||
|
// From an UpperLower sequence
|
||||||
|
// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
|
||||||
|
caseT{UpperCase, 0xA640, 0xA640},
|
||||||
|
caseT{LowerCase, 0xA640, 0xA641},
|
||||||
|
caseT{TitleCase, 0xA640, 0xA640},
|
||||||
|
// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
|
||||||
|
caseT{UpperCase, 0xA641, 0xA640},
|
||||||
|
caseT{LowerCase, 0xA641, 0xA641},
|
||||||
|
caseT{TitleCase, 0xA641, 0xA640},
|
||||||
|
// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
|
||||||
|
caseT{UpperCase, 0xA64E, 0xA64E},
|
||||||
|
caseT{LowerCase, 0xA64E, 0xA64F},
|
||||||
|
caseT{TitleCase, 0xA64E, 0xA64E},
|
||||||
|
// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
|
||||||
|
caseT{UpperCase, 0xA65F, 0xA65E},
|
||||||
|
caseT{LowerCase, 0xA65F, 0xA65F},
|
||||||
|
caseT{TitleCase, 0xA65F, 0xA65E},
|
||||||
|
|
||||||
|
// From a LowerUpper sequence
|
||||||
|
// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
|
||||||
|
caseT{UpperCase, 0x0139, 0x0139},
|
||||||
|
caseT{LowerCase, 0x0139, 0x013A},
|
||||||
|
caseT{TitleCase, 0x0139, 0x0139},
|
||||||
|
// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
|
||||||
|
caseT{UpperCase, 0x013f, 0x013f},
|
||||||
|
caseT{LowerCase, 0x013f, 0x0140},
|
||||||
|
caseT{TitleCase, 0x013f, 0x013f},
|
||||||
|
// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
|
||||||
|
caseT{UpperCase, 0x0148, 0x0147},
|
||||||
|
caseT{LowerCase, 0x0148, 0x0148},
|
||||||
|
caseT{TitleCase, 0x0148, 0x0147},
|
||||||
|
|
||||||
|
// Last block in the 5.1.0 table
|
||||||
|
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
|
||||||
|
caseT{UpperCase, 0x10400, 0x10400},
|
||||||
|
caseT{LowerCase, 0x10400, 0x10428},
|
||||||
|
caseT{TitleCase, 0x10400, 0x10400},
|
||||||
|
// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
|
||||||
|
caseT{UpperCase, 0x10427, 0x10427},
|
||||||
|
caseT{LowerCase, 0x10427, 0x1044F},
|
||||||
|
caseT{TitleCase, 0x10427, 0x10427},
|
||||||
|
// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
|
||||||
|
caseT{UpperCase, 0x10428, 0x10400},
|
||||||
|
caseT{LowerCase, 0x10428, 0x10428},
|
||||||
|
caseT{TitleCase, 0x10428, 0x10400},
|
||||||
|
// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
|
||||||
|
caseT{UpperCase, 0x1044F, 0x10427},
|
||||||
|
caseT{LowerCase, 0x1044F, 0x1044F},
|
||||||
|
caseT{TitleCase, 0x1044F, 0x10427},
|
||||||
|
|
||||||
|
// First one not in the 5.1.0 table
|
||||||
|
// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
|
||||||
|
caseT{UpperCase, 0x10450, 0x10450},
|
||||||
|
caseT{LowerCase, 0x10450, 0x10450},
|
||||||
|
caseT{TitleCase, 0x10450, 0x10450},
|
||||||
|
}
|
||||||
|
|
||||||
func TestIsLetter(t *testing.T) {
|
func TestIsLetter(t *testing.T) {
|
||||||
for i, r := range upperTest {
|
for i, r := range upperTest {
|
||||||
if !IsLetter(r) {
|
if !IsLetter(r) {
|
||||||
t.Errorf("IsLetter(%#x) = false, want true\n", r);
|
t.Errorf("IsLetter(U+%04X) = false, want true\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, r := range letterTest {
|
for i, r := range letterTest {
|
||||||
if !IsLetter(r) {
|
if !IsLetter(r) {
|
||||||
t.Errorf("IsLetter(%#x) = false, want true\n", r);
|
t.Errorf("IsLetter(U+%04X) = false, want true\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, r := range notletterTest {
|
for i, r := range notletterTest {
|
||||||
if IsLetter(r) {
|
if IsLetter(r) {
|
||||||
t.Errorf("IsLetter(%#x) = true, want false\n", r);
|
t.Errorf("IsLetter(U+%04X) = true, want false\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -110,17 +217,74 @@ func TestIsLetter(t *testing.T) {
|
|||||||
func TestIsUpper(t *testing.T) {
|
func TestIsUpper(t *testing.T) {
|
||||||
for i, r := range upperTest {
|
for i, r := range upperTest {
|
||||||
if !IsUpper(r) {
|
if !IsUpper(r) {
|
||||||
t.Errorf("IsUpper(%#x) = false, want true\n", r);
|
t.Errorf("IsUpper(U+%04X) = false, want true\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, r := range notupperTest {
|
for i, r := range notupperTest {
|
||||||
if IsUpper(r) {
|
if IsUpper(r) {
|
||||||
t.Errorf("IsUpper(%#x) = true, want false\n", r);
|
t.Errorf("IsUpper(U+%04X) = true, want false\n", r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, r := range notletterTest {
|
for i, r := range notletterTest {
|
||||||
if IsUpper(r) {
|
if IsUpper(r) {
|
||||||
t.Errorf("IsUpper(%#x) = true, want false\n", r);
|
t.Errorf("IsUpper(U+%04X) = true, want false\n", r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseString(c int) string {
|
||||||
|
switch c {
|
||||||
|
case UpperCase:
|
||||||
|
return "UpperCase"
|
||||||
|
case LowerCase:
|
||||||
|
return "LowerCase"
|
||||||
|
case TitleCase:
|
||||||
|
return "TitleCase"
|
||||||
|
}
|
||||||
|
return "ErrorCase"
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTo(t *testing.T) {
|
||||||
|
for i, c := range caseTest {
|
||||||
|
r := To(c.cas, c.in);
|
||||||
|
if c.out != r {
|
||||||
|
t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X\n", c.in, caseString(c.cas), r, c.out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToUpperCase(t *testing.T) {
|
||||||
|
for i, c := range caseTest {
|
||||||
|
if c.cas != UpperCase {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
r := ToUpper(c.in);
|
||||||
|
if c.out != r {
|
||||||
|
t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToLowerCase(t *testing.T) {
|
||||||
|
for i, c := range caseTest {
|
||||||
|
if c.cas != LowerCase {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
r := ToLower(c.in);
|
||||||
|
if c.out != r {
|
||||||
|
t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToTitleCase(t *testing.T) {
|
||||||
|
for i, c := range caseTest {
|
||||||
|
if c.cas != TitleCase {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
r := ToTitle(c.in);
|
||||||
|
if c.out != r {
|
||||||
|
t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,6 +21,14 @@ import (
|
|||||||
"unicode";
|
"unicode";
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
flag.Parse();
|
||||||
|
loadChars(); // always needed
|
||||||
|
printCategories();
|
||||||
|
printScripts();
|
||||||
|
printCases();
|
||||||
|
}
|
||||||
|
|
||||||
var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
|
var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
|
||||||
var url = flag.String("url",
|
var url = flag.String("url",
|
||||||
"http://www.unicode.org/Public/5.1.0/ucd/",
|
"http://www.unicode.org/Public/5.1.0/ucd/",
|
||||||
@ -31,6 +39,9 @@ var tablelist = flag.String("tables",
|
|||||||
var scriptlist = flag.String("scripts",
|
var scriptlist = flag.String("scripts",
|
||||||
"all",
|
"all",
|
||||||
"comma-separated list of which script tables to generate");
|
"comma-separated list of which script tables to generate");
|
||||||
|
var cases = flag.Bool("cases",
|
||||||
|
true,
|
||||||
|
"generate case tables");
|
||||||
var test = flag.Bool("test",
|
var test = flag.Bool("test",
|
||||||
false,
|
false,
|
||||||
"test existing tables; can be used to compare web data with package data");
|
"test existing tables; can be used to compare web data with package data");
|
||||||
@ -44,7 +55,7 @@ var category = map[string] bool{ "letter":true } // Nd Lu etc. letter is a speci
|
|||||||
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
|
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
|
||||||
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
|
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
|
||||||
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
|
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
|
||||||
// The fields
|
// The fields:
|
||||||
const (
|
const (
|
||||||
FCodePoint = iota;
|
FCodePoint = iota;
|
||||||
FName;
|
FName;
|
||||||
@ -87,11 +98,11 @@ var fieldName = []string{
|
|||||||
// This contains only the properties we're interested in.
|
// This contains only the properties we're interested in.
|
||||||
type Char struct {
|
type Char struct {
|
||||||
field []string; // debugging only; could be deleted if we take out char.dump()
|
field []string; // debugging only; could be deleted if we take out char.dump()
|
||||||
codePoint uint32; // redundant (it's the index in the chars table) but useful
|
codePoint uint32; // if zero, this index is not a valid code point.
|
||||||
category string;
|
category string;
|
||||||
upperCase uint32;
|
upperCase int;
|
||||||
lowerCase uint32;
|
lowerCase int;
|
||||||
titleCase uint32;
|
titleCase int;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scripts.txt has form:
|
// Scripts.txt has form:
|
||||||
@ -104,26 +115,21 @@ type Script struct {
|
|||||||
script string;
|
script string;
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
var chars = make([]Char, MaxChar+1)
|
||||||
flag.Parse();
|
|
||||||
printCategories();
|
|
||||||
printScripts();
|
|
||||||
}
|
|
||||||
|
|
||||||
var chars = make([]Char, MaxChar)
|
|
||||||
var scripts = make(map[string] []Script)
|
var scripts = make(map[string] []Script)
|
||||||
|
|
||||||
var lastChar uint32 = 0;
|
var lastChar uint32 = 0;
|
||||||
|
|
||||||
// In UnicodeData.txt, some ranges are marked like this:
|
// In UnicodeData.txt, some ranges are marked like this:
|
||||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||||
// parseCategory returns a state variable indicating the weirdness.
|
// parseCategory returns a state variable indicating the weirdness.
|
||||||
type State int
|
type State int
|
||||||
const (
|
const (
|
||||||
SNormal State = iota; // known to be zero for the type
|
SNormal State = iota; // known to be zero for the type
|
||||||
SFirst;
|
SFirst;
|
||||||
SLast;
|
SLast;
|
||||||
|
SMissing;
|
||||||
)
|
)
|
||||||
|
|
||||||
func parseCategory(line string) (state State) {
|
func parseCategory(line string) (state State) {
|
||||||
@ -139,7 +145,7 @@ func parseCategory(line string) (state State) {
|
|||||||
if point == 0 {
|
if point == 0 {
|
||||||
return // not interesting and we use 0 as unset
|
return // not interesting and we use 0 as unset
|
||||||
}
|
}
|
||||||
if point >= MaxChar {
|
if point > MaxChar {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
char := &chars[point];
|
char := &chars[point];
|
||||||
@ -189,7 +195,7 @@ func (char *Char) letter(u, l, t string) {
|
|||||||
char.titleCase = char.letterValue(t, "T");
|
char.titleCase = char.letterValue(t, "T");
|
||||||
}
|
}
|
||||||
|
|
||||||
func (char *Char) letterValue(s string, cas string) uint32 {
|
func (char *Char) letterValue(s string, cas string) int {
|
||||||
if s == "" {
|
if s == "" {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
@ -198,7 +204,7 @@ func (char *Char) letterValue(s string, cas string) uint32 {
|
|||||||
char.dump(cas);
|
char.dump(cas);
|
||||||
die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
|
die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
|
||||||
}
|
}
|
||||||
return uint32(v)
|
return int(v)
|
||||||
}
|
}
|
||||||
|
|
||||||
func allCategories() []string {
|
func allCategories() []string {
|
||||||
@ -242,10 +248,7 @@ func letterOp(code int) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func printCategories() {
|
func loadChars() {
|
||||||
if *tablelist == "" {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if *dataUrl == "" {
|
if *dataUrl == "" {
|
||||||
flag.Set("data", *url + "UnicodeData.txt");
|
flag.Set("data", *url + "UnicodeData.txt");
|
||||||
}
|
}
|
||||||
@ -288,6 +291,12 @@ func printCategories() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
resp.Body.Close();
|
resp.Body.Close();
|
||||||
|
}
|
||||||
|
|
||||||
|
func printCategories() {
|
||||||
|
if *tablelist == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
// Find out which categories to dump
|
// Find out which categories to dump
|
||||||
list := strings.Split(*tablelist, ",", 0);
|
list := strings.Split(*tablelist, ",", 0);
|
||||||
if *tablelist == "all" {
|
if *tablelist == "all" {
|
||||||
@ -299,11 +308,11 @@ func printCategories() {
|
|||||||
}
|
}
|
||||||
fmt.Printf(
|
fmt.Printf(
|
||||||
"// Generated by running\n"
|
"// Generated by running\n"
|
||||||
"// maketables --tables=%s --url=%s\n"
|
"// maketables --tables=%s --data=%s\n"
|
||||||
"// DO NOT EDIT\n\n"
|
"// DO NOT EDIT\n\n"
|
||||||
"package unicode\n\n",
|
"package unicode\n\n",
|
||||||
*tablelist,
|
*tablelist,
|
||||||
*url
|
*dataUrl
|
||||||
);
|
);
|
||||||
|
|
||||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.");
|
fmt.Println("// Version is the Unicode edition from which the tables are derived.");
|
||||||
@ -496,6 +505,9 @@ func parseScript(line string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func printScripts() {
|
func printScripts() {
|
||||||
|
if *scriptlist == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
var err os.Error;
|
var err os.Error;
|
||||||
scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
|
scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -604,3 +616,148 @@ func fullScriptTest(list []string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
CaseUpper = 1 << iota;
|
||||||
|
CaseLower;
|
||||||
|
CaseTitle;
|
||||||
|
CaseNone = 0; // must be zero
|
||||||
|
CaseMissing = -1; // character not present; not a valid case state
|
||||||
|
)
|
||||||
|
|
||||||
|
type caseState struct {
|
||||||
|
point int;
|
||||||
|
_case int;
|
||||||
|
deltaToUpper int;
|
||||||
|
deltaToLower int;
|
||||||
|
deltaToTitle int;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Is d a continuation of the state of c?
|
||||||
|
func (c *caseState) adjacent(d *caseState) bool {
|
||||||
|
if d.point < c.point {
|
||||||
|
return d.adjacent(c)
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case d.point != c.point+1:
|
||||||
|
return false
|
||||||
|
case d._case != c._case:
|
||||||
|
return false
|
||||||
|
case c._case == CaseNone:
|
||||||
|
return false
|
||||||
|
case c._case == CaseMissing:
|
||||||
|
return false
|
||||||
|
case d.deltaToUpper != c.deltaToUpper:
|
||||||
|
return false
|
||||||
|
case d.deltaToLower != c.deltaToLower:
|
||||||
|
return false
|
||||||
|
case d.deltaToTitle != c.deltaToTitle:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
func getCaseState(i int) (c *caseState) {
|
||||||
|
c = &caseState{ point: i, _case: CaseNone };
|
||||||
|
ch := &chars[i];
|
||||||
|
switch int(ch.codePoint) {
|
||||||
|
case 0:
|
||||||
|
c._case = CaseMissing; // Will get NUL wrong but that doesn't matter
|
||||||
|
return;
|
||||||
|
case ch.upperCase:
|
||||||
|
c._case = CaseUpper;
|
||||||
|
case ch.lowerCase:
|
||||||
|
c._case = CaseLower;
|
||||||
|
case ch.titleCase:
|
||||||
|
c._case = CaseTitle;
|
||||||
|
}
|
||||||
|
if ch.upperCase != 0 {
|
||||||
|
c.deltaToUpper = ch.upperCase - i
|
||||||
|
}
|
||||||
|
if ch.lowerCase != 0 {
|
||||||
|
c.deltaToLower = ch.lowerCase - i
|
||||||
|
}
|
||||||
|
if ch.titleCase != 0 {
|
||||||
|
c.deltaToTitle = ch.titleCase - i
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
func printCases() {
|
||||||
|
if !*cases {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if *test {
|
||||||
|
fullCaseTest();
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fmt.Printf(
|
||||||
|
"// Generated by running\n"
|
||||||
|
"// maketables --data=%s\n"
|
||||||
|
"// DO NOT EDIT\n\n"
|
||||||
|
"// CaseRanges is the table describing case mappings for all letters with\n"
|
||||||
|
"// non-self mappings.\n"
|
||||||
|
"var CaseRanges = _CaseRanges\n"
|
||||||
|
"var _CaseRanges = []CaseRange {\n",
|
||||||
|
*dataUrl
|
||||||
|
);
|
||||||
|
|
||||||
|
var startState *caseState; // the start of a run; nil for not active
|
||||||
|
var prevState = &caseState{}; // the state of the previous character
|
||||||
|
for i, c := range chars {
|
||||||
|
state := getCaseState(i);
|
||||||
|
if state.adjacent(prevState) {
|
||||||
|
prevState = state;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// end of run (possibly)
|
||||||
|
printCaseRange(startState, prevState);
|
||||||
|
startState = nil;
|
||||||
|
if state._case != CaseMissing && state._case != CaseNone {
|
||||||
|
startState = state;
|
||||||
|
}
|
||||||
|
prevState = state;
|
||||||
|
}
|
||||||
|
fmt.Printf("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
func printCaseRange(lo, hi *caseState) {
|
||||||
|
if lo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
|
||||||
|
// character represents itself in all cases - no need to mention it
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
|
||||||
|
lo.point, hi.point,
|
||||||
|
lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
|
||||||
|
}
|
||||||
|
|
||||||
|
// caseIt returns cased, the mapped value stored in a Char, unless it is zero;
// zero means no mapping was recorded, in which case rune maps to itself.
func caseIt(rune, cased int) int {
	if cased != 0 {
		return cased
	}
	return rune
}
|
||||||
|
|
||||||
|
func fullCaseTest() {
|
||||||
|
for i, c := range chars {
|
||||||
|
lower := unicode.ToLower(i);
|
||||||
|
want := caseIt(i, c.lowerCase);
|
||||||
|
if lower != want {
|
||||||
|
fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower);
|
||||||
|
}
|
||||||
|
upper := unicode.ToUpper(i);
|
||||||
|
want = caseIt(i, c.upperCase);
|
||||||
|
if upper != want {
|
||||||
|
fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper);
|
||||||
|
}
|
||||||
|
title := unicode.ToTitle(i);
|
||||||
|
want = caseIt(i, c.titleCase);
|
||||||
|
if title != want {
|
||||||
|
fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user