Re: convert unicode text to code point
Re: convert unicode text to code point
- Subject: Re: convert unicode text to code point
- From: "Nigel Garvey" <email@hidden>
- Date: Mon, 16 Jan 2006 18:27:11 +0000
bill wrote on Mon, 16 Jan 2006 06:47:16 +0800:
>Hello,
>
>Last week I re-read an old thread "Re: Producing Unicode-only
>characters" and found that Nigel suggested an exploitation of the
>single-record-list-to-string coercion bug:
[which involves]
>set gText to {{a:uText}} as string
Ooh, now. I'd hate anyone to think I'd actually _suggested_ that, though
I did indeed mention it -- ah -- in passing for completeness. It's even
less safe to rely on bugged behaviour than it is to rely on correct
behaviour, so in general one shouldn't do it.
>-- handler starts
>-- for handler codePoint from utxt
>property hexcode : {"00", "01", "02", "03", "04", "05", "06", "07",
>"08", "09", "0A", "0B", "0C", "0D", "0E", "0F", "10", "11", "12",
>"13", "14", "15", "16", "17", "18", "19", "1A", "1B", "1C", "1D",
>"1E", "1F", "20", "21", "22", "23", "24", "25", "26", "27", "28",
>"29", "2A", "2B", "2C", "2D", "2E", "2F", "30", "31", "32", "33",
>"34", "35", "36", "37", "38", "39", "3A", "3B", "3C", "3D", "3E",
>"3F", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
>"4A", "4B", "4C", "4D", "4E", "4F", "50", "51", "52", "53", "54",
>"55", "56", "57", "58", "59", "5A", "5B", "5C", "5D", "5E", "5F",
>"60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6A",
>"6B", "6C", "6D", "6E", "6F", "70", "71", "72", "73", "74", "75",
>"76", "77", "78", "79", "7A", "7B", "7C", "7D", "7E", "7F", "80",
>"81", "82", "83", "84", "85", "86", "87", "88", "89", "8A", "8B",
>"8C", "8D", "8E", "8F", "90", "91", "92", "93", "94", "95", "96",
>"97", "98", "99", "9A", "9B", "9C", "9D", "9E", "9F", "A0", "A1",
>"A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "AA", "AB", "AC",
>"AD", "AE", "AF", "B0", "B1", "B2", "B3", "B4", "B5", "B6", "B7",
>"B8", "B9", "BA", "BB", "BC", "BD", "BE", "BF", "C0", "C1", "C2",
>"C3", "C4", "C5", "C6", "C7", "C8", "C9", "CA", "CB", "CC", "CD",
>"CE", "CF", "D0", "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8",
>"D9", "DA", "DB", "DC", "DD", "DE", "DF", "E0", "E1", "E2", "E3",
>"E4", "E5", "E6", "E7", "E8", "E9", "EA", "EB", "EC", "ED", "EE",
>"EF", "F0", "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9",
>"FA", "FB", "FC", "FD", "FE", "FF"}
>
>on codePoint from unicodeText
> -- purpose: find the code point from unicode text
> -- input: unicodeText: unicode text
> -- output: unicode text
> -- usage: codePoint of me from «data utxt60B20000D87EDDF4» as
>Unicode text
> -- "60B20000D87EDDF4"
> set uCodePoint to "" as Unicode text
> set garbledList to items 63 thru -1 of ({{a:unicodeText}} as text)
> repeat with i from 1 to (count of garbledList)
> set uCodePoint to uCodePoint & my hexcode's item ((ASCII number
>(item i of garbledList)) + 1)
> end repeat
>end codePoint
>-- handler ends
>
>
>Well, I tried to use repeat loop to simulate the function of ASCII
>number, however, the speed is very slow. Hope someone on the list can
>improve this one, and make it a vanilla method :)
It's mainly the repeated 'ASCII number' calls that are slowing down the
script. You might as well save the Unicode text to file and read it back
as small integers. Then you can use fast maths to reference the hex list.
-- handler starts
-- for handler codePoint from utxt
-- Byte-to-hex lookup table used by the codePoint handler.
-- Item (b + 1) is the two-digit, uppercase hexadecimal spelling of byte
-- value b, for b = 0 thru 255. Each row below holds one high nibble.
property hexcode : {¬
	"00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "0A", "0B", "0C", "0D", "0E", "0F", ¬
	"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "1A", "1B", "1C", "1D", "1E", "1F", ¬
	"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "2A", "2B", "2C", "2D", "2E", "2F", ¬
	"30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "3A", "3B", "3C", "3D", "3E", "3F", ¬
	"40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "4A", "4B", "4C", "4D", "4E", "4F", ¬
	"50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "5A", "5B", "5C", "5D", "5E", "5F", ¬
	"60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6A", "6B", "6C", "6D", "6E", "6F", ¬
	"70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "7A", "7B", "7C", "7D", "7E", "7F", ¬
	"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "8A", "8B", "8C", "8D", "8E", "8F", ¬
	"90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "9A", "9B", "9C", "9D", "9E", "9F", ¬
	"A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "AA", "AB", "AC", "AD", "AE", "AF", ¬
	"B0", "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B9", "BA", "BB", "BC", "BD", "BE", "BF", ¬
	"C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "CA", "CB", "CC", "CD", "CE", "CF", ¬
	"D0", "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "DA", "DB", "DC", "DD", "DE", "DF", ¬
	"E0", "E1", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9", "EA", "EB", "EC", "ED", "EE", "EF", ¬
	"F0", "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "FA", "FB", "FC", "FD", "FE", "FF"}
on codePoint from unicodeText
	-- Purpose: spell out the 16-bit units of a piece of Unicode text as
	--          hexadecimal, four uppercase digits per unit.
	-- Input:   unicodeText - the text to convert.
	-- Output:  Unicode text, e.g. "60B20000D87EDDF4" for the four units
	--          60B2 0000 D87E DDF4; "" for empty input.
	-- Errors:  any file error from the scratch-file round trip is re-raised
	--          to the caller after the file has been closed.
	-- Method:  the text is written to a scratch file in the temporary items
	--          folder and read back as small (16-bit) integers, which avoids
	--          a slow per-character 'ASCII number' call.
	-- NOTE(review): this assumes the byte order used when writing Unicode
	-- text matches the byte order used when reading small integers -- confirm
	-- on the target system.
	
	set scratchPath to (path to temporary items as Unicode text) & "utxt scratch.txt"
	set fref to (open for access file scratchPath with write permission)
	try
		-- Truncate first so data left over from an earlier call is never read back.
		set eof fref to 0
		-- Force the 16-bit representation whatever class of text was passed in.
		write unicodeText to fref as Unicode text
		if (get eof fref) is 0 then
			-- Nothing was written: reading from an empty file would raise an
			-- end-of-file error, so short-circuit to an empty list instead.
			set smallInts to {}
		else
			-- 'as list' covers the single-unit case, where read returns a bare integer.
			set smallInts to (read fref from 1 as small integer) as list
		end if
		close access fref
	on error errMsg number errNum
		-- Always release the file, then report the real failure instead of
		-- letting it resurface later as an undefined-variable error.
		try
			close access fref
		end try
		error errMsg number errNum
	end try
	
	set uCodePoint to "" as Unicode text
	repeat with i from 1 to (count smallInts)
		-- Small integers are signed; map -32768..32767 onto 0..65535.
		set n to (65536 + (item i of smallInts)) mod 65536
		-- High byte then low byte, each looked up in the 'hexcode' table.
		set uCodePoint to uCodePoint & my hexcode's item (n div 256 + 1) & my hexcode's item (n mod 256 + 1)
	end repeat
	return uCodePoint
end codePoint
-- handler ends
-- Example: the data literal below holds the 16-bit units 60B2 0000 D87E DDF4;
-- the expected result is shown on the line after the call.
codePoint of me from «data utxt60B20000D87EDDF4» as Unicode text
--> "60B20000D87EDDF4"
NG
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Applescript-users mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden