/* 4 space tabs filename: xt.h package: common utils/xml tokenizer description: xml routine headers for xt.c author: ydnar version history date user description ---------------------------------------------------------------------------- 02.11.2001 ydnar initial version for ia32 linux 02.13.2001 ydnar more work 02.15.2001 ydnar full rewrite 02.19.2001 ydnar it's coming together 02.20.2001 ydnar pulling an all-nighter 02.21.2001 ydnar pulling an all-nighter (again) changed to xt.h (from xu.h) as of today, it recognizes: + all start/empty/end tags w/attributes + comments + CDATA sections + character data todo: - char refs/entities in char data + attributes - processing instructions - entity definition (w3c == satan) - turn \r and \r\n into \n - proper error returns (on all sources) 02.22.2001 ydnar adding character references, entity references */ /* fingerprint */ #ifndef XT_H #define XT_H #endif /* includes */ #ifndef CT_H #include "ct.h" /* common types */ #endif #ifndef UU_H #include "uu.h" /* unicode utilities */ #endif /* types */ /* character range struct */ typedef struct XTCharRange XTCharRange; struct XTCharRange { UCS4Char min, max; }; /* xml token struct */ typedef struct XTToken XTToken; struct XTToken { UTF8String name; Flags flags; UInt32 len; /* 0 len = unlimited */ XTCharRange *cr1, *cr; /* first, rest character range */ UTF8String beginStr, endStr; UInt32 next[ 8 ]; /* directed graph */ }; /* xml token flags */ enum { XT_FLAGS_NULL = 0, XT_FLAGS_PRESERVE = 1, XT_FLAGS_NOTIFY_APP = 2, XT_FLAGS_INTERPOLATE_ENTITIES = 4 }; /* xml tokenizer struct */ typedef struct XTTokenizer XTTokenizer; struct XTTokenizer { UInt32 status, lastStatus, tokID; /* tokenizer status, current token id (in xtTokens[]) */ UInt32 beginLen, endLen; UInt32 dataAlloc, dataLen; UTF8String data; UInt32 charWaiting; UUReader r; /* character source */ }; /* xml tokenizer status codes */ enum { XT_STATUS_ERROR = -1, XT_STATUS_FOUND_TOKEN = 0, XT_STATUS_WORKING = 1, XT_STATUS_WANT_DATA = -100 }; /* xml token buffer size increments */ enum { XT_TOKEN_ALLOC = 4096 /* should be enough... */ }; /* xml base character sub-range definitions */ /* fake null sub-range */ #define XT_SR_NULL { 0x0, 0x0 } /* fake 'Empty' sub-range */ #define XT_SR_EMPTY { 0x0, 0x3B }, { 0x3E, 0xD7FF }, /* [^<] */ \ { 0xE000, 0xFFFD }, { 0x10000, 0x10FFFF } /* xml 'Char' sub-range */ #define XT_SR_CHAR { 0x9, 0x9 }, { 0xA, 0xA }, { 0xD, 0xD }, \ { 0x20, 0xD7FF }, \ { 0xE000, 0xFFFD }, { 0x10000, 0x10FFFF } /* xml 'CharData' sub-range */ #define XT_SR_CHAR_DATA { 0x9, 0x9 }, { 0xA, 0xA }, { 0xD, 0xD }, \ { 0x20, 0x25 }, { 0x27, 0x3B }, { 0x3E, 0xD7FF }, /* [^&<] */ \ { 0xE000, 0xFFFD }, { 0x10000, 0x10FFFF } /* xml 'AttValueQuot' sub-range */ #define XT_SR_ATT_VALUE_QUOT { 0x9, 0x9 }, { 0xA, 0xA }, { 0xD, 0xD }, \ { 0x20, 0x21 }, { 0x23, 0x25 }, { 0x27, 0x3B }, { 0x3E, 0xD7FF }, /* [^"&<] */ \ { 0xE000, 0xFFFD }, { 0x10000, 0x10FFFF } /* xml 'AttValueApos' sub-range */ #define XT_SR_ATT_VALUE_APOS { 0x9, 0x9 }, { 0xA, 0xA }, { 0xD, 0xD }, \ { 0x20, 0x25 }, { 0x28, 0x3B }, { 0x3E, 0xD7FF }, /* [^'&<] */ \ { 0xE000, 0xFFFD }, { 0x10000, 0x10FFFF } /* xml 'S' (whitespace) sub-range */ #define XT_SR_S { 0x9, 0x9 }, { 0xA, 0xA }, { 0xD, 0xD }, { 0x20, 0x20 } /* xml 'BaseChar' sub-range */ #define XT_SR_BASE_CHAR { 0x0041, 0x005A }, { 0x0061, 0x007A }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, \ { 0x00F8, 0x00FF }, { 0x0100, 0x0131 }, { 0x0134, 0x013E }, { 0x0141, 0x0148 }, \ { 0x014A, 0x017E }, { 0x0180, 0x01C3 }, { 0x01CD, 0x01F0 }, { 0x01F4, 0x01F5 }, \ { 0x01FA, 0x0217 }, { 0x0250, 0x02A8 }, { 0x02BB, 0x02C1 }, { 0x0386, 0x0386 }, \ { 0x0388, 0x038A }, { 0x038C, 0x038C }, { 0x038E, 0x03A1 }, { 0x03A3, 0x03CE }, \ { 0x03D0, 0x03D6 }, { 0x03DA, 0x03DA }, { 0x03DC, 0x03DC }, { 0x03DE, 0x03DE }, \ { 0x03E0, 0x03E0 }, { 0x03E2, 0x03F3 }, { 0x0401, 0x040C }, { 0x040E, 0x044F }, \ { 0x0451, 0x045C }, { 0x045E, 0x0481 }, { 0x0490, 0x04C4 }, { 0x04C7, 0x04C8 }, \ { 0x04CB, 0x04CC }, { 0x04D0, 0x04EB }, { 0x04EE, 0x04F5 }, { 0x04F8, 0x04F9 }, \ { 0x0531, 0x0556 }, { 0x0559, 0x0559 }, { 0x0561, 0x0586 }, { 0x05D0, 0x05EA }, \ { 0x05F0, 0x05F2 }, { 0x0621, 0x063A }, { 0x0641, 0x064A }, { 0x0671, 0x06B7 }, \ { 0x06BA, 0x06BE }, { 0x06C0, 0x06CE }, { 0x06D0, 0x06D3 }, { 0x06D5, 0x06D5 }, \ { 0x06E5, 0x06E6 }, { 0x0905, 0x0939 }, { 0x093D, 0x093D }, { 0x0958, 0x0961 }, \ { 0x0985, 0x098C }, { 0x098F, 0x0990 }, { 0x0993, 0x09A8 }, { 0x09AA, 0x09B0 }, \ { 0x09B2, 0x09B2 }, { 0x09B6, 0x09B9 }, { 0x09DC, 0x09DD }, { 0x09DF, 0x09E1 }, \ { 0x09F0, 0x09F1 }, { 0x0A05, 0x0A0A }, { 0x0A0F, 0x0A10 }, { 0x0A13, 0x0A28 }, \ { 0x0A2A, 0x0A30 }, { 0x0A32, 0x0A33 }, { 0x0A35, 0x0A36 }, { 0x0A38, 0x0A39 }, \ { 0x0A59, 0x0A5C }, { 0x0A5E, 0x0A5E }, { 0x0A72, 0x0A74 }, { 0x0A85, 0x0A8B }, \ { 0x0A8D, 0x0A8D }, { 0x0A8F, 0x0A91 }, { 0x0A93, 0x0AA8 }, { 0x0AAA, 0x0AB0 }, \ { 0x0AB2, 0x0AB3 }, { 0x0AB5, 0x0AB9 }, { 0x0ABD, 0x0ABD }, { 0x0AE0, 0x0AE0 }, \ { 0x0B05, 0x0B0C }, { 0x0B0F, 0x0B10 }, { 0x0B13, 0x0B28 }, { 0x0B2A, 0x0B30 }, \ { 0x0B32, 0x0B33 }, { 0x0B36, 0x0B39 }, { 0x0B3D, 0x0B3D }, { 0x0B5C, 0x0B5D }, \ { 0x0B5F, 0x0B61 }, { 0x0B85, 0x0B8A }, { 0x0B8E, 0x0B90 }, { 0x0B92, 0x0B95 }, \ { 0x0B99, 0x0B9A }, { 0x0B9C, 0x0B9C }, { 0x0B9E, 0x0B9F }, { 0x0BA3, 0x0BA4 }, \ { 0x0BA8, 0x0BAA }, { 0x0BAE, 0x0BB5 }, { 0x0BB7, 0x0BB9 }, { 0x0C05, 0x0C0C }, \ { 0x0C0E, 0x0C10 }, { 0x0C12, 0x0C28 }, { 0x0C2A, 0x0C33 }, { 0x0C35, 0x0C39 }, \ { 0x0C60, 0x0C61 }, { 0x0C85, 0x0C8C }, { 0x0C8E, 0x0C90 }, { 0x0C92, 0x0CA8 }, \ { 0x0CAA, 0x0CB3 }, { 0x0CB5, 0x0CB9 }, { 0x0CDE, 0x0CDE }, { 0x0CE0, 0x0CE1 }, \ { 0x0D05, 0x0D0C }, { 0x0D0E, 0x0D10 }, { 0x0D12, 0x0D28 }, { 0x0D2A, 0x0D39 }, \ { 0x0D60, 0x0D61 }, { 0x0E01, 0x0E2E }, { 0x0E30, 0x0E30 }, { 0x0E32, 0x0E33 }, \ { 0x0E40, 0x0E45 }, { 0x0E81, 0x0E82 }, { 0x0E84, 0x0E84 }, { 0x0E87, 0x0E88 }, \ { 0x0E8A, 0x0E8A }, { 0x0E8D, 0x0E8D }, { 0x0E94, 0x0E97 }, { 0x0E99, 0x0E9F }, \ { 0x0EA1, 0x0EA3 }, { 0x0EA5, 0x0EA5 }, { 0x0EA7, 0x0EA7 }, { 0x0EAA, 0x0EAB }, \ { 0x0EAD, 0x0EAE }, { 0x0EB0, 0x0EB0 }, { 0x0EB2, 0x0EB3 }, { 0x0EBD, 0x0EBD }, \ { 0x0EC0, 0x0EC4 }, { 0x0F40, 0x0F47 }, { 0x0F49, 0x0F69 }, { 0x10A0, 0x10C5 }, \ { 0x10D0, 0x10F6 }, { 0x1100, 0x1100 }, { 0x1102, 0x1103 }, { 0x1105, 0x1107 }, \ { 0x1109, 0x1109 }, { 0x110B, 0x110C }, { 0x110E, 0x1112 }, { 0x113C, 0x113C }, \ { 0x113E, 0x113E }, { 0x1140, 0x1140 }, { 0x114C, 0x114C }, { 0x114E, 0x114E }, \ { 0x1150, 0x1150 }, { 0x1154, 0x1155 }, { 0x1159, 0x1159 }, { 0x115F, 0x1161 }, \ { 0x1163, 0x1163 }, { 0x1165, 0x1165 }, { 0x1167, 0x1167 }, { 0x1169, 0x1169 }, \ { 0x116D, 0x116E }, { 0x1172, 0x1173 }, { 0x1175, 0x1175 }, { 0x119E, 0x119E }, \ { 0x11A8, 0x11A8 }, { 0x11AB, 0x11AB }, { 0x11AE, 0x11AF }, { 0x11B7, 0x11B8 }, \ { 0x11BA, 0x11BA }, { 0x11BC, 0x11C2 }, { 0x11EB, 0x11EB }, { 0x11F0, 0x11F0 }, \ { 0x11F9, 0x11F9 }, { 0x1E00, 0x1E9B }, { 0x1EA0, 0x1EF9 }, { 0x1F00, 0x1F15 }, \ { 0x1F18, 0x1F1D }, { 0x1F20, 0x1F45 }, { 0x1F48, 0x1F4D }, { 0x1F50, 0x1F57 }, \ { 0x1F59, 0x1F59 }, { 0x1F5B, 0x1F5B }, { 0x1F5D, 0x1F5D }, { 0x1F5F, 0x1F7D }, \ { 0x1F80, 0x1FB4 }, { 0x1FB6, 0x1FBC }, { 0x1FBE, 0x1FBE }, { 0x1FC2, 0x1FC4 }, \ { 0x1FC6, 0x1FCC }, { 0x1FD0, 0x1FD3 }, { 0x1FD6, 0x1FDB }, { 0x1FE0, 0x1FEC }, \ { 0x1FF2, 0x1FF4 }, { 0x1FF6, 0x1FFC }, { 0x2126, 0x2126 }, { 0x212A, 0x212B }, \ { 0x212E, 0x212E }, { 0x2180, 0x2182 }, { 0x3041, 0x3094 }, { 0x30A1, 0x30FA }, \ { 0x3105, 0x312C }, { 0xAC00, 0xD7A3 } /* xml 'Ideographic' sub-range */ #define XT_SR_IDEOGRAPHIC { 0x4E00, 0x9FA5 }, { 0x3007, 0x3007 }, { 0x3021, 0x3029 } /* xml 'CombiningChar' sub-range */ #define XT_SR_COMBINING_CHAR { 0x0300, 0x0345 }, { 0x0360, 0x0361 }, { 0x0483, 0x0486 }, { 0x0591, 0x05A1 }, \ { 0x05A3, 0x05B9 }, { 0x05BB, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, \ { 0x05C4, 0x05C4 }, { 0x064B, 0x0652 }, { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, \ { 0x06DD, 0x06DF }, { 0x06E0, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, \ { 0x0901, 0x0903 }, { 0x093C, 0x093C }, { 0x093E, 0x094C }, { 0x094D, 0x094D }, \ { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0983 }, { 0x09BC, 0x09BC }, \ { 0x09BE, 0x09BE }, { 0x09BF, 0x09BF }, { 0x09C0, 0x09C4 }, { 0x09C7, 0x09C8 }, \ { 0x09CB, 0x09CD }, { 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 }, { 0x0A02, 0x0A02 }, \ { 0x0A3C, 0x0A3C }, { 0x0A3E, 0x0A3E }, { 0x0A3F, 0x0A3F }, { 0x0A40, 0x0A42 }, \ { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A83 }, \ { 0x0ABC, 0x0ABC }, { 0x0ABE, 0x0AC5 }, { 0x0AC7, 0x0AC9 }, { 0x0ACB, 0x0ACD }, \ { 0x0B01, 0x0B03 }, { 0x0B3C, 0x0B3C }, { 0x0B3E, 0x0B43 }, { 0x0B47, 0x0B48 }, \ { 0x0B4B, 0x0B4D }, { 0x0B56, 0x0B57 }, { 0x0B82, 0x0B83 }, { 0x0BBE, 0x0BC2 }, \ { 0x0BC6, 0x0BC8 }, { 0x0BCA, 0x0BCD }, { 0x0BD7, 0x0BD7 }, { 0x0C01, 0x0C03 }, \ { 0x0C3E, 0x0C44 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, \ { 0x0C82, 0x0C83 }, { 0x0CBE, 0x0CC4 }, { 0x0CC6, 0x0CC8 }, { 0x0CCA, 0x0CCD }, \ { 0x0CD5, 0x0CD6 }, { 0x0D02, 0x0D03 }, { 0x0D3E, 0x0D43 }, { 0x0D46, 0x0D48 }, \ { 0x0D4A, 0x0D4D }, { 0x0D57, 0x0D57 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, \ { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC }, \ { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, \ { 0x0F39, 0x0F39 }, { 0x0F3E, 0x0F3E }, { 0x0F3F, 0x0F3F }, { 0x0F71, 0x0F84 }, \ { 0x0F86, 0x0F8B }, { 0x0F90, 0x0F95 }, { 0x0F97, 0x0F97 }, { 0x0F99, 0x0FAD }, \ { 0x0FB1, 0x0FB7 }, { 0x0FB9, 0x0FB9 }, { 0x20D0, 0x20DC }, { 0x20E1, 0x20E1 }, \ { 0x302A, 0x302F }, { 0x3099, 0x3099 }, { 0x309A, 0x309A } /* xml 'Digit' sub-range */ #define XT_SR_DIGIT { 0x0030, 0x0039 }, { 0x0660, 0x0669 }, { 0x06F0, 0x06F9 }, { 0x0966, 0x096F }, \ { 0x09E6, 0x09EF }, { 0x0A66, 0x0A6F }, { 0x0AE6, 0x0AEF }, { 0x0B66, 0x0B6F }, \ { 0x0BE7, 0x0BEF }, { 0x0C66, 0x0C6F }, { 0x0CE6, 0x0CEF }, { 0x0D66, 0x0D6F }, \ { 0x0E50, 0x0E59 }, { 0x0ED0, 0x0ED9 }, { 0x0F20, 0x0F29 } /* xml 'Extender' sub-range */ #define XT_SR_EXTENDER { 0x00B7, 0x00B7 }, { 0x02D0, 0x02D0 }, { 0x02D1, 0x02D1 }, { 0x0387, 0x0387 }, \ { 0x0640, 0x0640 }, { 0x0E46, 0x0E46 }, { 0x0EC6, 0x0EC6 }, { 0x3005, 0x3005 }, \ { 0x3031, 0x3035 }, { 0x309D, 0x309E }, { 0x30FC, 0x30FE } /* xml 'Letter' sub-range */ #define XT_SR_LETTER XT_SR_BASE_CHAR, XT_SR_IDEOGRAPHIC /* xml character ranges */ /* fake 'Empty' range */ #ifndef XT_C extern XTCharRange xtCREmpty[]; #else XTCharRange xtCREmpty[] = { XT_SR_EMPTY, XT_SR_NULL }; #endif /* xml 'Char' range */ #ifndef XT_C extern XTCharRange xtCRChar[]; #else XTCharRange xtCRChar[] = { XT_SR_CHAR, XT_SR_NULL }; #endif /* xml 'CharData' range */ #ifndef XT_C extern XTCharRange xtCRCharData[]; #else XTCharRange xtCRCharData[] = { XT_SR_CHAR_DATA, XT_SR_NULL }; #endif /* xml 'AttValueQuot' range */ #ifndef XT_C extern XTCharRange xtCRAttValueQuot[]; #else XTCharRange xtCRAttValueQuot[] = { XT_SR_ATT_VALUE_QUOT, XT_SR_NULL }; #endif /* xml 'AttValueApos' range */ #ifndef XT_C extern XTCharRange xtCRAttValueApos[]; #else XTCharRange xtCRAttValueApos[] = { XT_SR_ATT_VALUE_APOS, XT_SR_NULL }; #endif /* xml 'S' (whitespace) range */ #ifndef XT_C extern XTCharRange xtCRS[]; #else XTCharRange xtCRS[] = { XT_SR_S, XT_SR_NULL }; #endif /* xml 'BaseChar' range */ #ifndef XT_C extern XTCharRange xtCRBaseChar[]; #else XTCharRange xtCRBaseChar[] = { XT_SR_BASE_CHAR, XT_SR_NULL }; #endif /* xml 'Ideographic' range */ #ifndef XT_C extern XTCharRange xtCRIdeographic[]; #else XTCharRange xtCRIdeographic[] = { XT_SR_IDEOGRAPHIC, XT_SR_NULL }; #endif /* xml 'CombiningChar' range */ #ifndef XT_C extern XTCharRange xtCRCombiningChar[]; #else XTCharRange xtCRCombiningChar[] = { XT_SR_COMBINING_CHAR, XT_SR_NULL }; #endif /* xml 'Digit' range */ #ifndef XT_C extern XTCharRange xtCRDigit[]; #else XTCharRange xtCRDigit[] = { XT_SR_DIGIT, XT_SR_NULL }; #endif /* xml 'Extender' range */ #ifndef XT_C extern XTCharRange xtCRExtender[]; #else XTCharRange xtCRExtender[] = { XT_SR_EXTENDER, XT_SR_NULL }; #endif /* xml 'Letter' range */ #ifndef XT_C extern XTCharRange xtCRLetter[]; #else XTCharRange xtCRLetter[] = { XT_SR_LETTER, XT_SR_NULL }; #endif /* xml 'NameChar' range */ #ifndef XT_C extern XTCharRange xtCRNameChar[]; #else XTCharRange xtCRNameChar[] = { { '.', '.' }, { '-', '-' }, { '_', '_' }, { ':', ':' }, XT_SR_LETTER, XT_SR_DIGIT, XT_SR_NULL }; #endif /* xml 'NameFirst' (first char in a 'Name') range */ #ifndef XT_C extern XTCharRange xtCRNameFirst[]; #else XTCharRange xtCRNameFirst[] = { { '_', '_' }, { ':', ':' }, XT_SR_LETTER, XT_SR_NULL }; #endif /* xml 'Amp' (helper) range */ #ifndef XT_C extern XTCharRange xtCRAmp[]; #else XTCharRange xtCRAmp[] = { { '&', '&' }, XT_SR_NULL }; #endif /* xml 'Hash' (helper) range */ #ifndef XT_C extern XTCharRange xtCRHash[]; #else XTCharRange xtCRHash[] = { { '#', '#' }, XT_SR_NULL }; #endif /* xml 'X' (helper) range */ #ifndef XT_C extern XTCharRange xtCRX[]; #else XTCharRange xtCRX[] = { { 'x', 'x' }, XT_SR_NULL }; #endif /* xml 'DecNumber' (helper) range */ #ifndef XT_C extern XTCharRange xtCRDecNumber[]; #else XTCharRange xtCRDecNumber[] = { { '0', '9' }, XT_SR_NULL }; #endif /* xml 'HexNumber' (helper) range */ #ifndef XT_C extern XTCharRange xtCRHexNumber[]; #else XTCharRange xtCRHexNumber[] = { { '0', '9' }, { 'a', 'f' }, { 'A', 'F' }, XT_SR_NULL }; #endif /* xml 'Semi' (helper) range */ #ifndef XT_C extern XTCharRange xtCRSemi[]; #else XTCharRange xtCRSemi[] = { { ';', ';' }, XT_SR_NULL }; #endif /* xml 'LessThan' (helper) range */ #ifndef XT_C extern XTCharRange xtCRLessThan[]; #else XTCharRange xtCRLessThan[] = { { '<', '<' }, XT_SR_NULL }; #endif /* xml 'GreaterThan' (helper) range */ #ifndef XT_C extern XTCharRange xtCRGreaterThan[]; #else XTCharRange xtCRGreaterThan[] = { { '>', '>' }, XT_SR_NULL }; #endif /* xml 'Excl' (exclaimation point) (helper) range */ #ifndef XT_C extern XTCharRange xtCRExcl[]; #else XTCharRange xtCRExcl[] = { { '!', '!' }, XT_SR_NULL }; #endif /* xml 'LeftBracket' (helper) range */ #ifndef XT_C extern XTCharRange xtCRLeftBracket[]; #else XTCharRange xtCRLeftBracket[] = { { '[', '[' }, XT_SR_NULL }; #endif /* xml 'Dash' (helper) range */ #ifndef XT_C extern XTCharRange xtCRDash[]; #else XTCharRange xtCRDash[] = { { '-', '-' }, XT_SR_NULL }; #endif /* xml 'Slash' (helper) range */ #ifndef XT_C extern XTCharRange xtCRSlash[]; #else XTCharRange xtCRSlash[] = { { '/', '/' }, XT_SR_NULL }; #endif /* xml 'Eq' (helper) range */ #ifndef XT_C extern XTCharRange xtCREq[]; #else XTCharRange xtCREq[] = { { '=', '=' }, XT_SR_NULL }; #endif /* xml 'Quot' (helper) range */ #ifndef XT_C extern XTCharRange xtCRQuot[]; #else XTCharRange xtCRQuot[] = { { '"', '"' }, XT_SR_NULL }; #endif /* xml 'Apos' (helper) range */ #ifndef XT_C extern XTCharRange xtCRApos[]; #else XTCharRange xtCRApos[] = { { '\'', '\'' }, XT_SR_NULL }; #endif /* xml sub-token definitions */ /* fake 'Empty' sub-token */ #define XT_ST_EMPTY XT_FLAGS_NULL, 0, \ xtCREmpty, xtCREmpty, \ NULL, NULL /* xml 'CharData' sub-token */ #define XT_ST_CHAR_DATA XT_FLAGS_NULL, 0, \ xtCRCharData, xtCRCharData, \ NULL, NULL /* xml 'AttValueQuot' sub-token */ #define XT_ST_ATT_VALUE_QUOT XT_FLAGS_NULL, 0, \ xtCRAttValueQuot, xtCRAttValueQuot, \ NULL, NULL /* xml 'AttValueApos' sub-token */ #define XT_ST_ATT_VALUE_APOS XT_FLAGS_NULL, 0, \ xtCRAttValueApos, xtCRAttValueApos, \ NULL, NULL /* xml 'S' (whitespace) sub-token */ #define XT_ST_S XT_FLAGS_NULL, 0, \ xtCRS, xtCRS, \ NULL, NULL /* xml 'Name' sub-token */ #define XT_ST_NAME XT_FLAGS_NULL, 0, \ xtCRNameFirst, xtCRNameChar, \ NULL, NULL /* xml 'Amp' (&) sub-token */ #define XT_ST_AMP XT_FLAGS_NULL, 1, \ xtCRAmp, xtCRAmp, \ NULL, NULL /* xml 'Hash' (#) sub-token */ #define XT_ST_HASH XT_FLAGS_NULL, 1, \ xtCRHash, xtCRHash, \ NULL, NULL /* xml 'DecRef' (decimal character reference) sub-token */ #define XT_ST_DEC_REF XT_FLAGS_NULL, 0, \ xtCRDecNumber, xtCRDecNumber, \ NULL, NULL /* xml 'StartHexRef' (decimal character reference) sub-token */ #define XT_ST_START_HEX_REF XT_FLAGS_NULL, 1, \ xtCRX, xtCRX, \ NULL, NULL /* xml 'HexRef' (decimal character reference) sub-token */ #define XT_ST_HEX_REF XT_FLAGS_NULL, 0, \ xtCRHexNumber, xtCRHexNumber, \ NULL, NULL /* xml 'Semi' (;) sub-token */ #define XT_ST_SEMI XT_FLAGS_NULL, 0, \ xtCRSemi, xtCRSemi, \ NULL, NULL /* xml 'LessThan' (<) sub-token */ #define XT_ST_LESS_THAN XT_FLAGS_NULL, 1, \ xtCRLessThan, xtCRLessThan, \ NULL, NULL /* xml 'GreaterThan' (>) sub-token */ #define XT_ST_GREATER_THAN XT_FLAGS_NULL, 1, \ xtCRGreaterThan, xtCRGreaterThan, \ NULL, NULL /* xml 'Slash' (/) sub-token */ #define XT_ST_SLASH XT_FLAGS_NULL, 1, \ xtCRSlash, xtCRSlash, \ NULL, NULL /* xml 'Eq' (=) sub-token */ #define XT_ST_EQ XT_FLAGS_NULL, 1, \ xtCREq, xtCREq, \ NULL, NULL /* xml 'Quot' (") sub-token */ #define XT_ST_QUOT XT_FLAGS_NULL, 1, \ xtCRQuot, xtCRQuot, \ NULL, NULL /* xml 'Apos' (') sub-token */ #define XT_ST_APOS XT_FLAGS_NULL, 1, \ xtCRApos, xtCRApos, \ NULL, NULL /* xml 'Excl' (!) sub-token */ #define XT_ST_EXCL XT_FLAGS_NULL, 1, \ xtCRExcl, xtCRExcl, \ NULL, NULL /* xml 'Comment' (helper) sub-token */ #define XT_ST_COMMENT XT_FLAGS_NULL, 0, \ xtCRDash, xtCRChar, \ "--", "--" /* xml 'CDATA' (helper) sub-token */ #define XT_ST_CDATA XT_FLAGS_NULL, 0, \ xtCRLeftBracket, xtCRChar, \ "[CDATA[", "]]>" /* TODO - implement the rest of the start tag tokens - implement CharRef and EntityRef in CharData and Attributes - implement CharRef and EntityRef interpolation - implement comments - implement CDATA */ /* token types */ enum { XT_TOKEN_EMPTY, /* token order _must_ reconcile with the following ordered array! */ XT_TOKEN_CHAR_DATA, XT_TOKEN_CHAR_DATA_AMP, XT_TOKEN_CHAR_DATA_HASH, XT_TOKEN_CHAR_DATA_DEC_REF, XT_TOKEN_CHAR_DATA_START_HEX_REF, XT_TOKEN_CHAR_DATA_HEX_REF, XT_TOKEN_CHAR_DATA_SEMI, XT_TOKEN_START_MARKUP, XT_TOKEN_END_MARKUP, XT_TOKEN_START_TAG_NAME, XT_TOKEN_START_TAG_S, XT_TOKEN_START_TAG_ATT_NAME, XT_TOKEN_START_TAG_ATT_PRE_EQ_S, XT_TOKEN_START_TAG_ATT_EQ, XT_TOKEN_START_TAG_ATT_POST_EQ_S, XT_TOKEN_START_TAG_ATT_OPEN_QUOT, XT_TOKEN_START_TAG_ATT_VALUE_QUOT, XT_TOKEN_START_TAG_ATT_CLOSE_QUOT, XT_TOKEN_START_TAG_ATT_OPEN_APOS, XT_TOKEN_START_TAG_ATT_VALUE_APOS, XT_TOKEN_START_TAG_ATT_CLOSE_APOS, XT_TOKEN_EMPTY_TAG_SLASH, XT_TOKEN_END_TAG_SLASH, XT_TOKEN_END_TAG_NAME, XT_TOKEN_END_TAG_S, XT_TOKEN_START_MARKUP_EXCL, XT_TOKEN_COMMENT, XT_TOKEN_CDATA, XT_TOKEN_COUNT }; enum { XT_TOKEN_NONE = -1 }; /* the tokens */ #ifndef XT_C extern XTToken xtTokens[ XT_TOKEN_COUNT ]; #else XTToken xtTokens[ XT_TOKEN_COUNT ] = { /* token format: [token name (should match up with above id)], [sub-token definition], { [next token 1], [next token 2], ... [next token 7], XT_TOKEN_NONE } */ /* Empty */ { "XT_TOKEN_EMPTY", XT_ST_EMPTY, { XT_TOKEN_START_MARKUP, XT_TOKEN_NONE } }, /* CharData */ { "XT_TOKEN_CHAR_DATA", XT_ST_CHAR_DATA, { XT_TOKEN_START_MARKUP, XT_TOKEN_CHAR_DATA_AMP, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_AMP", XT_ST_AMP, { XT_TOKEN_CHAR_DATA_HASH, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_HASH", XT_ST_HASH, { XT_TOKEN_CHAR_DATA_DEC_REF, XT_TOKEN_CHAR_DATA_START_HEX_REF, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_DEC_REF", XT_ST_DEC_REF, { XT_TOKEN_CHAR_DATA_SEMI, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_START_HEX_REF", XT_ST_START_HEX_REF, { XT_TOKEN_CHAR_DATA_HEX_REF, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_HEX_REF", XT_ST_HEX_REF, { XT_TOKEN_CHAR_DATA_SEMI, XT_TOKEN_NONE } }, { "XT_TOKEN_CHAR_DATA_SEMI", XT_ST_SEMI, { XT_TOKEN_CHAR_DATA, XT_TOKEN_CHAR_DATA_AMP, XT_TOKEN_START_MARKUP, XT_TOKEN_NONE } }, /* Markup */ { "XT_TOKEN_START_MARKUP", XT_ST_LESS_THAN, { XT_TOKEN_START_TAG_NAME, XT_TOKEN_END_TAG_SLASH, XT_TOKEN_START_MARKUP_EXCL, XT_TOKEN_NONE } }, { "XT_TOKEN_END_MARKUP", XT_ST_GREATER_THAN, { XT_TOKEN_START_MARKUP, XT_TOKEN_CHAR_DATA, XT_TOKEN_CHAR_DATA_AMP, XT_TOKEN_NONE } }, /* StartTag, EmptyTag */ { "XT_TOKEN_START_TAG_NAME", XT_ST_NAME, { XT_TOKEN_END_MARKUP, XT_TOKEN_START_TAG_S, XT_TOKEN_EMPTY_TAG_SLASH, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_S", XT_ST_S, { XT_TOKEN_END_MARKUP, XT_TOKEN_START_TAG_ATT_NAME, XT_TOKEN_EMPTY_TAG_SLASH, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_NAME", XT_ST_NAME, { XT_TOKEN_START_TAG_ATT_EQ, XT_TOKEN_START_TAG_ATT_PRE_EQ_S, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_PRE_EQ_S", XT_ST_S, { XT_TOKEN_START_TAG_ATT_EQ, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_EQ", XT_ST_EQ, { XT_TOKEN_START_TAG_ATT_OPEN_QUOT, XT_TOKEN_START_TAG_ATT_OPEN_APOS, XT_TOKEN_START_TAG_ATT_POST_EQ_S, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_POST_EQ_S", XT_ST_S, { XT_TOKEN_START_TAG_ATT_OPEN_QUOT, XT_TOKEN_START_TAG_ATT_OPEN_APOS, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_OPEN_QUOT", XT_ST_QUOT, { XT_TOKEN_START_TAG_ATT_VALUE_QUOT, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_VALUE_QUOT", XT_ST_ATT_VALUE_QUOT, { XT_TOKEN_START_TAG_ATT_CLOSE_QUOT, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_CLOSE_QUOT", XT_ST_QUOT, { XT_TOKEN_END_MARKUP, XT_TOKEN_START_TAG_S, XT_TOKEN_START_TAG_ATT_NAME, XT_TOKEN_EMPTY_TAG_SLASH, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_OPEN_APOS", XT_ST_APOS, { XT_TOKEN_START_TAG_ATT_VALUE_APOS, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_VALUE_APOS", XT_ST_ATT_VALUE_APOS, { XT_TOKEN_START_TAG_ATT_CLOSE_APOS, XT_TOKEN_NONE } }, { "XT_TOKEN_START_TAG_ATT_CLOSE_APOS", XT_ST_APOS, { XT_TOKEN_END_MARKUP, XT_TOKEN_START_TAG_S, XT_TOKEN_START_TAG_ATT_NAME, XT_TOKEN_EMPTY_TAG_SLASH, XT_TOKEN_NONE } }, /* EmptyTag */ { "XT_TOKEN_EMPTY_TAG_SLASH", XT_ST_SLASH, { XT_TOKEN_END_MARKUP, XT_TOKEN_NONE } }, /* EndTag */ { "XT_TOKEN_END_TAG_SLASH", XT_ST_SLASH, { XT_TOKEN_END_TAG_NAME, XT_TOKEN_NONE } }, { "XT_TOKEN_END_TAG_NAME", XT_ST_NAME, { XT_TOKEN_END_MARKUP, XT_TOKEN_END_TAG_S, XT_TOKEN_NONE } }, { "XT_TOKEN_END_TAG_S", XT_ST_S, { XT_TOKEN_END_MARKUP, XT_TOKEN_NONE } }, /* Comment, CDATA, Entity Hell (FIXME: add the rest of this shit) */ { "XT_TOKEN_START_MARKUP_EXCL", XT_ST_EXCL, { XT_TOKEN_COMMENT, XT_TOKEN_CDATA, XT_TOKEN_NONE } }, /* Comment */ { "XT_TOKEN_COMMENT", XT_ST_COMMENT, { XT_TOKEN_END_MARKUP, XT_TOKEN_NONE } }, /* CDATA */ { "XT_TOKEN_CDATA", XT_ST_CDATA, { XT_TOKEN_START_MARKUP, XT_TOKEN_CHAR_DATA, XT_TOKEN_CHAR_DATA_AMP, XT_TOKEN_NONE } } }; #endif /* public function prototypes */ Error xtOpenTokenizer( XTTokenizer *t ); Error xtCloseTokenizer( XTTokenizer *t ); Error xtFillTokenizer( XTTokenizer *t, UInt32 bufLen, Char *buf ); UInt32 xtInCharRange( UCS4Char uc, XTCharRange *range ); Error xtReadNextToken( XTTokenizer *t );