xref: /dragonfly/sys/vfs/udf/osta.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
1 /*
2  * Various routines from the OSTA 2.01 specs.  Copyrights are included with
3  * each code segment.  Slight whitespace modifications have been made for
4  * formatting purposes.  Typos/bugs have been fixed.
5  *
6  * $FreeBSD: src/sys/fs/udf/osta.c,v 1.3 2003/11/05 06:55:23 scottl Exp $
7  * $DragonFly: src/sys/vfs/udf/osta.c,v 1.3 2006/12/23 00:41:30 swildner Exp $
8  */
9 
10 #include <vfs/udf/osta.h>
11 
12 /*****************************************************************************/
13 /***********************************************************************
14  * OSTA compliant Unicode compression, uncompression routines.
15  * Copyright 1995 Micro Design International, Inc.
16  * Written by Jason M. Rinn.
17  * Micro Design International gives permission for the free use of the
18  * following source code.
19  */
20 
21 /***********************************************************************
22  * Takes an OSTA CS0 compressed unicode name, and converts
23  * it to Unicode.
24  * The Unicode output will be in the byte order
25  * that the local compiler uses for 16-bit values.
26  * NOTE: This routine only performs error checking on the compID.
27  * It is up to the user to ensure that the unicode buffer is large
28  * enough, and that the compressed unicode name is correct.
29  *
30  * RETURN VALUE
31  *
32  * The number of unicode characters which were uncompressed.
33  * A -1 is returned if the compression ID is invalid.
34  */
35 int
udf_UncompressUnicode(int numberOfBytes,byte * UDFCompressed,unicode_t * unicode)36 udf_UncompressUnicode(
37           int numberOfBytes,  /* (Input) number of bytes read from media. */
38           byte *UDFCompressed,          /* (Input) bytes read from media. */
39           unicode_t *unicode) /* (Output) uncompressed unicode characters. */
40 {
41           unsigned int compID;
42           int returnValue, unicodeIndex, byteIndex;
43 
44           /* Use UDFCompressed to store current byte being read. */
45           compID = UDFCompressed[0];
46 
47           /* First check for valid compID. */
48           if (compID != 8 && compID != 16) {
49                     returnValue = -1;
50           } else {
51                     unicodeIndex = 0;
52                     byteIndex = 1;
53 
54                     /* Loop through all the bytes. */
55                     while (byteIndex < numberOfBytes) {
56                               if (compID == 16) {
57                                         /* Move the first byte to the high bits of the
58                                          * unicode char.
59                                          */
60                                         unicode[unicodeIndex] =
61                                             UDFCompressed[byteIndex++] << 8;
62                               } else {
63                                         unicode[unicodeIndex] = 0;
64                               }
65                               if (byteIndex < numberOfBytes) {
66                                         /*Then the next byte to the low bits. */
67                                         unicode[unicodeIndex] |=
68                                             UDFCompressed[byteIndex++];
69                               }
70                               unicodeIndex++;
71                     }
72                     returnValue = unicodeIndex;
73           }
74           return(returnValue);
75 }
76 
77 /*
78  * Almost same as udf_UncompressUnicode(). The difference is that
79  * it keeps byte order of unicode string.
80  */
81 int
udf_UncompressUnicodeByte(int numberOfBytes,byte * UDFCompressed,byte * unicode)82 udf_UncompressUnicodeByte(
83           int numberOfBytes,  /* (Input) number of bytes read from media. */
84           byte *UDFCompressed,          /* (Input) bytes read from media. */
85           byte *unicode)                /* (Output) uncompressed unicode characters. */
86 {
87           unsigned int compID;
88           int returnValue, unicodeIndex, byteIndex;
89 
90           /* Use UDFCompressed to store current byte being read. */
91           compID = UDFCompressed[0];
92 
93           /* First check for valid compID. */
94           if (compID != 8 && compID != 16) {
95                     returnValue = -1;
96           } else {
97                     unicodeIndex = 0;
98                     byteIndex = 1;
99 
100                     /* Loop through all the bytes. */
101                     while (byteIndex < numberOfBytes) {
102                               if (compID == 16) {
103                                         /* Move the first byte to the high bits of the
104                                          * unicode char.
105                                          */
106                                         unicode[unicodeIndex++] =
107                                             UDFCompressed[byteIndex++];
108                               } else {
109                                         unicode[unicodeIndex++] = 0;
110                               }
111                               if (byteIndex < numberOfBytes) {
112                                         /*Then the next byte to the low bits. */
113                                         unicode[unicodeIndex++] =
114                                             UDFCompressed[byteIndex++];
115                               }
116                     }
117                     returnValue = unicodeIndex;
118           }
119           return(returnValue);
120 }
121 
122 /***********************************************************************
123  * DESCRIPTION:
124  * Takes a string of unicode wide characters and returns an OSTA CS0
125  * compressed unicode string. The unicode MUST be in the byte order of
126  * the compiler in order to obtain correct results. Returns an error
127  * if the compression ID is invalid.
128  *
129  * NOTE: This routine assumes the implementation already knows, by
130  * the local environment, how many bits are appropriate and
131  * therefore does no checking to test if the input characters fit
132  * into that number of bits or not.
133  *
134  * RETURN VALUE
135  *
136  * The total number of bytes in the compressed OSTA CS0 string,
137  * including the compression ID.
138  * A -1 is returned if the compression ID is invalid.
139  */
140 int
udf_CompressUnicode(int numberOfChars,int compID,unicode_t * unicode,byte * UDFCompressed)141 udf_CompressUnicode(
142           int numberOfChars,  /* (Input) number of unicode characters. */
143           int compID,                   /* (Input) compression ID to be used. */
144           unicode_t *unicode, /* (Input) unicode characters to compress. */
145           byte *UDFCompressed)          /* (Output) compressed string, as bytes. */
146 {
147           int byteIndex, unicodeIndex;
148 
149           if (compID != 8 && compID != 16) {
150                     byteIndex = -1; /* Unsupported compression ID ! */
151           } else {
152                     /* Place compression code in first byte. */
153                     UDFCompressed[0] = compID;
154 
155                     byteIndex = 1;
156                     unicodeIndex = 0;
157                     while (unicodeIndex < numberOfChars) {
158                               if (compID == 16) {
159                                         /* First, place the high bits of the char
160                                          * into the byte stream.
161                                          */
162                                         UDFCompressed[byteIndex++] =
163                                             (unicode[unicodeIndex] & 0xFF00) >> 8;
164                               }
165                               /*Then place the low bits into the stream. */
166                               UDFCompressed[byteIndex++] =
167                                   unicode[unicodeIndex] & 0x00FF;
168                               unicodeIndex++;
169                     }
170           }
171           return(byteIndex);
172 }
173 
174 /*****************************************************************************/
/*
 * CRC 010041
 *
 * Byte-wise lookup table for a 16-bit CRC with generator polynomial
 * 010041 (octal), i.e. 0x1021.  The table is read-only.
 */
static const unsigned short crc_table[256] = {
	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
	0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
	0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
	0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
	0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
	0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
	0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
	0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
	0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
	0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
	0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
	0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
	0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
	0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
	0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
	0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
	0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
	0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
	0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
	0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
	0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
	0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
	0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
	0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
	0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
	0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
	0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
	0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
	0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
	0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
	0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
	0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};

/*
 * Folds one more byte into a running CRC and returns the new value.
 * The int result of the expression is truncated to 16 bits on return.
 */
static unsigned short
udf_crc_step(unsigned short crc, unsigned char b)
{
	return(crc_table[((crc >> 8) ^ b) & 0xff] ^ (crc << 8));
}

/*
 * Computes the CRC of the n bytes starting at s, beginning from an
 * initial value of zero.  A count of zero or less yields 0.
 */
unsigned short
udf_cksum(unsigned char *s, int n)
{
	unsigned short crc = 0;
	int i;

	for (i = 0; i < n; i++)
		crc = udf_crc_step(crc, s[i]);
	return crc;
}

/*
 * UNICODE Checksum
 *
 * Computes the same CRC over n 16-bit characters.  A count of zero or
 * less yields 0.
 */
unsigned short
udf_unicode_cksum(unsigned short *s, int n)
{
	unsigned short crc = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* Take high order byte first--corresponds to a big endian
		 * byte stream.
		 */
		crc = udf_crc_step(crc, s[i] >> 8);
		crc = udf_crc_step(crc, s[i] & 0xff);
	}
	return crc;
}
238 
#ifdef MAIN
/*
 * Stand-alone self-test, built only when MAIN is defined: prints the
 * CRC that udf_cksum() computes for a three-byte sample next to the
 * value it is expected to produce.
 */
#include <stdio.h>

unsigned char bytes[] = { 0x70, 0x6A, 0x77 };

int
main(void)
{
	unsigned short x;

	x = udf_cksum(bytes, sizeof bytes);
	printf("checksum: calculated=%4.4x, correct=%4.4x\n", x, 0x3299);
	return(0);
}
#endif
252 
253 /*****************************************************************************/
254 #ifdef NEEDS_ISPRINT
255 /***********************************************************************
256  * OSTA UDF compliant file name translation routine for OS/2,
257  * Windows 95, Windows NT, Macintosh and UNIX.
258  * Copyright 1995 Micro Design International, Inc.
259  * Written by Jason M. Rinn.
260  * Micro Design International gives permission for the free use of the
261  * following source code.
262  */
263 
264 /***********************************************************************
265  * To use these routines with different operating systems.
266  *
267  * OS/2
268  * Define OS2
269  * Define MAXLEN = 254
270  *
271  * Windows 95
272  * Define WIN_95
273  * Define MAXLEN = 255
274  *
275  * Windows NT
276  * Define WIN_NT
277  * Define MAXLEN = 255
278  *
279  * Macintosh:
280  * Define MAC.
281  * Define MAXLEN = 31.
282  *
283  * UNIX
284  * Define UNIX.
285  * Define MAXLEN as specified by unix version.
286  */
287 
/* Underscore: stands in for each run of illegal or non-printable chars. */
#define	ILLEGAL_CHAR_MARK	0x005F
/* Number sign: introduces the four hex CRC digits in a translated name. */
#define	CRC_MARK	0x0023
/* Longest extension (characters after the period) that is preserved. */
#define	EXT_SIZE	5
/* Boolean constants; kept if an earlier header already defined them. */
#ifndef TRUE
#define	TRUE	1
#endif
#ifndef FALSE
#define	FALSE	0
#endif
/* Unicode code points of '.' and ' '. */
#define	PERIOD	0x002E
#define	SPACE	0x0020
295 
296 /*** PROTOTYPES ***/
297 int IsIllegal(unicode_t ch);
298 
299 /* Define a function or macro which determines if a Unicode character is
300  * printable under your implementation.
301  */
302 int UnicodeIsPrint(unicode_t);
303 
304 /***********************************************************************
305  * Translates a long file name to one using a MAXLEN and an illegal
306  * char set in accord with the OSTA requirements. Assumes the name has
307  * already been translated to Unicode.
308  *
309  * RETURN VALUE
310  *
311  * Number of unicode characters in translated name.
312  */
UDFTransName(unicode_t * newName,unicode_t * udfName,int udfLen)313 int UDFTransName(
314           unicode_t *newName, /* (Output)Translated name. Must be of length
315                                          * MAXLEN */
316           unicode_t *udfName, /* (Input) Name from UDF volume.*/
317           int udfLen)                   /* (Input) Length of UDF Name. */
318 {
319           int index, newIndex = 0, needsCRC = FALSE;
320           int extIndex = 0, newExtIndex = 0, hasExt = FALSE;
321 #if defined OS2 || defined WIN_95 || defined WIN_NT
322           int trailIndex = 0;
323 #endif
324           unsigned short valueCRC;
325           unicode_t current;
326           const char hexChar[] = "0123456789ABCDEF";
327 
328           for (index = 0; index < udfLen; index++) {
329                     current = udfName[index];
330 
331                     if (IsIllegal(current) || !UnicodeIsPrint(current)) {
332                               needsCRC = TRUE;
333                               /* Replace Illegal and non-displayable chars with
334                                * underscore.
335                                */
336                               current = ILLEGAL_CHAR_MARK;
337                               /* Skip any other illegal or non-displayable
338                                * characters.
339                                */
340                               while(index+1 < udfLen && (IsIllegal(udfName[index+1])
341                                   || !UnicodeIsPrint(udfName[index+1]))) {
342                                         index++;
343                               }
344                     }
345 
346                     /* Record position of extension, if one is found. */
347                     if (current == PERIOD && (udfLen - index -1) <= EXT_SIZE) {
348                               if (udfLen == index + 1) {
349                                         /* A trailing period is NOT an extension. */
350                                         hasExt = FALSE;
351                               } else {
352                                         hasExt = TRUE;
353                                         extIndex = index;
354                                         newExtIndex = newIndex;
355                               }
356                     }
357 
358 #if defined OS2 || defined WIN_95 || defined WIN_NT
359                     /* Record position of last char which is NOT period or space. */
360                     else if (current != PERIOD && current != SPACE) {
361                               trailIndex = newIndex;
362                     }
363 #endif
364 
365                     if (newIndex < MAXLEN) {
366                               newName[newIndex++] = current;
367                     } else {
368                               needsCRC = TRUE;
369                     }
370           }
371 
372 #if defined OS2 || defined WIN_95 || defined WIN_NT
373           /* For OS2, 95 & NT, truncate any trailing periods and\or spaces. */
374           if (trailIndex != newIndex - 1) {
375                     newIndex = trailIndex + 1;
376                     needsCRC = TRUE;
377                     hasExt = FALSE; /* Trailing period does not make an
378                                          * extension. */
379           }
380 #endif
381 
382           if (needsCRC) {
383                     unicode_t ext[EXT_SIZE];
384                     int localExtIndex = 0;
385                     if (hasExt) {
386                               int maxFilenameLen;
387                               /* Translate extension, and store it in ext. */
388                               for(index = 0; index<EXT_SIZE &&
389                                   extIndex + index +1 < udfLen; index++ ) {
390                                         current = udfName[extIndex + index + 1];
391                                         if (IsIllegal(current) ||
392                                             !UnicodeIsPrint(current)) {
393                                                   needsCRC = 1;
394                                                   /* Replace Illegal and non-displayable
395                                                    * chars with underscore.
396                                                    */
397                                                   current = ILLEGAL_CHAR_MARK;
398                                                   /* Skip any other illegal or
399                                                    * non-displayable characters.
400                                                    */
401                                                   while(index + 1 < EXT_SIZE
402                                                       && (IsIllegal(udfName[extIndex +
403                                                       index + 2]) ||
404                                                       !isprint(udfName[extIndex +
405                                                       index + 2]))) {
406                                                             index++;
407                                                   }
408                                         }
409                                         ext[localExtIndex++] = current;
410                               }
411 
412                               /* Truncate filename to leave room for extension and
413                                * CRC.
414                                */
415                               maxFilenameLen = ((MAXLEN - 5) - localExtIndex - 1);
416                               if (newIndex > maxFilenameLen) {
417                                         newIndex = maxFilenameLen;
418                               } else {
419                                         newIndex = newExtIndex;
420                               }
421                     } else if (newIndex > MAXLEN - 5) {
422                               /*If no extension, make sure to leave room for CRC. */
423                               newIndex = MAXLEN - 5;
424                     }
425                     newName[newIndex++] = CRC_MARK; /* Add mark for CRC. */
426 
427                     /*Calculate CRC from original filename from FileIdentifier. */
428                     valueCRC = udf_unicode_cksum(udfName, udfLen);
429                     /* Convert 16-bits of CRC to hex characters. */
430                     newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
431                     newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
432                     newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
433                     newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
434 
435                     /* Place a translated extension at end, if found. */
436                     if (hasExt) {
437                               newName[newIndex++] = PERIOD;
438                               for (index = 0;index < localExtIndex ;index++ ) {
439                                         newName[newIndex++] = ext[index];
440                               }
441                     }
442           }
443           return(newIndex);
444 }
445 
446 #if defined OS2 || defined WIN_95 || defined WIN_NT
447 /***********************************************************************
448  * Decides if a Unicode character matches one of a list
449  * of ASCII characters.
450  * Used by OS2 version of IsIllegal for readability, since all of the
451  * illegal characters above 0x0020 are in the ASCII subset of Unicode.
452  * Works very similarly to the standard C function strchr().
453  *
454  * RETURN VALUE
455  *
456  * Non-zero if the Unicode character is in the given ASCII string.
457  */
UnicodeInString(unsigned char * string,unicode_t ch)458 int UnicodeInString(
459           unsigned char *string,        /* (Input) String to search through. */
460           unicode_t ch)                 /* (Input) Unicode char to search for. */
461 {
462           int found = FALSE;
463           while (*string != '\0' && found == FALSE) {
464                     /* These types should compare, since both are unsigned
465                      * numbers. */
466                     if (*string == ch) {
467                               found = TRUE;
468                     }
469                     string++;
470           }
471           return(found);
472 }
473 #endif /* OS2 */
474 
475 /***********************************************************************
476  * Decides whether the given character is illegal for a given OS.
477  *
478  * RETURN VALUE
479  *
480  * Non-zero if char is illegal.
481  */
IsIllegal(unicode_t ch)482 int IsIllegal(unicode_t ch)
483 {
484 #ifdef MAC
485           /* Only illegal character on the MAC is the colon. */
486           if (ch == 0x003A) {
487                     return(1);
488           } else {
489                     return(0);
490           }
491 
492 #elif defined UNIX
493           /* Illegal UNIX characters are NULL and slash. */
494           if (ch == 0x0000 || ch == 0x002F) {
495                     return(1);
496           } else {
497                     return(0);
498           }
499 
500 #elif defined OS2 || defined WIN_95 || defined WIN_NT
501           /* Illegal char's for OS/2 according to WARP toolkit. */
502           if (ch < 0x0020 || UnicodeInString("\\/:*?\"<>|", ch)) {
503                     return(1);
504           } else {
505                     return(0);
506           }
507 #endif
508 }
509 #endif
510