MagickCore  6.9.13-46
Convert, Edit, Or Compose Bitmap Images
token.c
1 /*
2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 % %
4 % %
5 % %
6 % TTTTT OOO K K EEEEE N N %
7 % T O O K K E NN N %
8 % T O O KKK EEE N N N %
9 % T O O K K E N NN %
10 % T OOO K K EEEEE N N %
11 % %
12 % %
13 % MagickCore Token Methods %
14 % %
15 % Software Design %
16 % Cristy %
17 % January 1993 %
18 % %
19 % %
20 % Copyright 1999 ImageMagick Studio LLC, a non-profit organization %
21 % dedicated to making software imaging solutions freely available. %
22 % %
23 % You may not use this file except in compliance with the License. You may %
24 % obtain a copy of the License at %
25 % %
26 % https://imagemagick.org/license/ %
27 % %
28 % Unless required by applicable law or agreed to in writing, software %
29 % distributed under the License is distributed on an "AS IS" BASIS, %
30 % WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. %
31 % See the License for the specific language governing permissions and %
32 % limitations under the License. %
33 % %
34 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
35 %
36 %
37 %
38 */
39 
40 /*
41  Include declarations.
42 */
43 #include "magick/studio.h"
44 #include "magick/exception.h"
45 #include "magick/exception-private.h"
46 #include "magick/image.h"
47 #include "magick/image-private.h"
48 #include "magick/locale-private.h"
49 #include "magick/memory_.h"
50 #include "magick/string_.h"
51 #include "magick/string-private.h"
52 #include "magick/token.h"
53 #include "magick/token-private.h"
54 #include "magick/utility.h"
55 
56 /*
57  Typedef declarations.
58 */
59 struct _TokenInfo
60 {
61  int
62  state;
63 
64  MagickStatusType
65  flag;
66 
67  ssize_t
68  offset;
69 
70  char
71  quote;
72 
73  size_t
74  signature;
75 };
76 
77 /*
78 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
79 % %
80 % %
81 % %
82 % A c q u i r e T o k e n I n f o %
83 % %
84 % %
85 % %
86 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
87 %
88 % AcquireTokenInfo() allocates the TokenInfo structure.
89 %
90 % The format of the AcquireTokenInfo method is:
91 %
92 % TokenInfo *AcquireTokenInfo()
93 %
94 */
95 MagickExport TokenInfo *AcquireTokenInfo(void)
96 {
97  TokenInfo
98  *token_info;
99 
100  token_info=(TokenInfo *) AcquireMagickMemory(sizeof(*token_info));
101  if (token_info == (TokenInfo *) NULL)
102  ThrowFatalException(ResourceLimitFatalError,"MemoryAllocationFailed");
103  token_info->signature=MagickCoreSignature;
104  return(token_info);
105 }
106 
107 /*
108 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
109 % %
110 % %
111 % %
112 % D e s t r o y T o k e n I n f o %
113 % %
114 % %
115 % %
116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
117 %
118 % DestroyTokenInfo() deallocates memory associated with a TokenInfo
119 % structure.
120 %
121 % The format of the DestroyTokenInfo method is:
122 %
123 % TokenInfo *DestroyTokenInfo(TokenInfo *token_info)
124 %
125 % A description of each parameter follows:
126 %
127 % o token_info: Specifies a pointer to a TokenInfo structure.
128 %
129 */
130 MagickExport TokenInfo *DestroyTokenInfo(TokenInfo *token_info)
131 {
132  assert(token_info != (TokenInfo *) NULL);
133  assert(token_info->signature == MagickCoreSignature);
134  if (IsEventLogging() != MagickFalse)
135  (void) LogMagickEvent(TraceEvent,GetMagickModule(),"...");
136  token_info->signature=(~MagickCoreSignature);
137  token_info=(TokenInfo *) RelinquishMagickMemory(token_info);
138  return(token_info);
139 }
140 
141 /*
142 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
143 % %
144 % %
145 % %
146 + G e t N e x t T o k e n %
147 % %
148 % %
149 % %
150 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
151 %
152 % GetNextToken() gets a token from the token stream. A token is defined as
153 % a sequence of characters delimited by whitespace (e.g. clip-path), a
154 % sequence delimited with quotes (e.g. "Quote me"), or a sequence enclosed in
155 % parenthesis (e.g. rgb(0,0,0)). GetNextToken() also recognizes these
156 % separator characters: ':', '=', ',', and ';'. GetNextToken() returns the
157 % length of the consumed token.
158 %
159 % The format of the GetNextToken method is:
160 %
161 % size_t GetNextToken(const char *magick_restrict start,
162 % const char **magick_restrict end,const size_t extent,
163 % char *magick_restrict token)
164 %
165 % A description of each parameter follows:
166 %
167 % o start: the start of the token sequence.
168 %
169 % o end: point to the end of the token sequence.
170 %
171 % o extent: maximum extent of the token.
172 %
173 % o token: copy the token to this buffer.
174 %
175 */
176 MagickExport magick_hot_spot size_t GetNextToken(
177  const char *magick_restrict start,const char **magick_restrict end,
178  const size_t extent,char *magick_restrict token)
179 {
180  double
181  value;
182 
183  char
184  *magick_restrict q;
185 
186  const char
187  *magick_restrict p;
188 
189  ssize_t
190  i;
191 
192  assert(start != (const char *) NULL);
193  assert(token != (char *) NULL);
194  i=0;
195  p=start;
196  while ((isspace((int) ((unsigned char) *p)) != 0) && (*p != '\0'))
197  p++;
198  switch (*p)
199  {
200  case '\0':
201  break;
202  case '"':
203  case '\'':
204  case '`':
205  case '{':
206  {
207  char
208  escape;
209 
210  switch (*p)
211  {
212  case '"': escape='"'; break;
213  case '\'': escape='\''; break;
214  case '`': escape='\''; break;
215  case '{': escape='}'; break;
216  default: escape=(*p); break;
217  }
218  for (p++; *p != '\0'; p++)
219  {
220  if ((*p == '\\') && ((*(p+1) == escape) || (*(p+1) == '\\')))
221  p++;
222  else
223  if (*p == escape)
224  {
225  p++;
226  break;
227  }
228  if (i < (ssize_t) (extent-1))
229  token[i++]=(*p);
230  if ((size_t) (p-start) >= (extent-1))
231  break;
232  }
233  break;
234  }
235  case '/':
236  {
237  if (i < (ssize_t) (extent-1))
238  token[i++]=(*p);
239  p++;
240  if ((*p == '>') || (*p == '/'))
241  {
242  if (i < (ssize_t) (extent-1))
243  token[i++]=(*p);
244  p++;
245  }
246  break;
247  }
248  default:
249  {
250  char
251  *q;
252 
253  value=StringToDouble(p,&q);
254  (void) value;
255  if ((p != q) && (*p != ','))
256  {
257  for ( ; (p < q) && (*p != ','); p++)
258  {
259  if (i < (ssize_t) (extent-1))
260  token[i++]=(*p);
261  if ((size_t) (p-start) >= (extent-1))
262  break;
263  }
264  if (*p == '%')
265  {
266  if (i < (ssize_t) (extent-1))
267  token[i++]=(*p);
268  p++;
269  }
270  break;
271  }
272  if ((*p != '\0') && (isalpha((int) ((unsigned char) *p)) == 0) &&
273  (*p != *DirectorySeparator) && (*p != '#') && (*p != '<'))
274  {
275  if (i < (ssize_t) (extent-1))
276  token[i++]=(*p);
277  p++;
278  break;
279  }
280  for ( ; *p != '\0'; p++)
281  {
282  if (((isspace((int) ((unsigned char) *p)) != 0) || (*p == '=') ||
283  (*p == ',') || (*p == ':') || (*p == ';')) && (*(p-1) != '\\'))
284  break;
285  if ((i > 0) && (*p == '<'))
286  break;
287  if (i < (ssize_t) (extent-1))
288  token[i++]=(*p);
289  if (*p == '>')
290  break;
291  if (*p == '(')
292  {
293  for (p++; *p != '\0'; p++)
294  {
295  if (i < (ssize_t) (extent-1))
296  token[i++]=(*p);
297  if ((*p == ')') && (*(p-1) != '\\'))
298  break;
299  if ((size_t) (p-start) >= (extent-1))
300  break;
301  }
302  if (*p == '\0')
303  break;
304  }
305  if ((size_t) (p-start) >= (extent-1))
306  break;
307  }
308  break;
309  }
310  }
311  token[i]='\0';
312  if (LocaleNCompare(token,"url(#",5) == 0)
313  {
314  q=strrchr(token,')');
315  if (q != (char *) NULL)
316  {
317  *q='\0';
318  (void) memmove(token,token+5,(size_t) (q-token-4));
319  }
320  }
321  while (isspace((int) ((unsigned char) *p)) != 0)
322  p++;
323  if (end != (const char **) NULL)
324  *end=(const char *) p;
325  return(p-start+1);
326 }
327 
328 /*
329 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
330 % %
331 % %
332 % %
333 % G l o b E x p r e s s i o n %
334 % %
335 % %
336 % %
337 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
338 %
339 % GlobExpression() returns MagickTrue if the expression matches the pattern.
340 %
341 % The format of the GlobExpression function is:
342 %
343 % MagickBooleanType GlobExpression(const char *magick_restrict expression,
344 % const char *magick_restrict pattern,
345 % const MagickBooleanType case_insensitive)
346 %
347 % A description of each parameter follows:
348 %
349 % o expression: Specifies a pointer to a text string containing a file name.
350 %
351 % o pattern: Specifies a pointer to a text string containing a pattern.
352 %
353 % o case_insensitive: set to MagickTrue to ignore the case when matching
354 % an expression.
355 %
356 */
357 
358 static MagickBooleanType GlobExpression_(const char *magick_restrict expression,
359  const char *magick_restrict pattern,const MagickBooleanType case_insensitive,
360  const size_t depth)
361 {
362  if (depth > MagickMaxRecursionDepth)
363  {
364  errno=EOVERFLOW;
365  return(MagickFalse);
366  }
367  /*
368  Empty pattern or single '*' always matches.
369  */
370  if (pattern == (const char *) NULL)
371  return(MagickTrue);
372  if (GetUTFCode(pattern) == 0)
373  return(MagickTrue);
374  if ((GetUTFCode(pattern) == '*') &&
375  (GetUTFCode(pattern+GetUTFOctets(pattern)) == 0))
376  return(MagickTrue);
377  if ((strchr(pattern,'{') == NULL) &&
378  (strchr(pattern,'*') == NULL) &&
379  (strchr(pattern,'?') == NULL))
380  {
381  char
382  path[MagickPathExtent]= { 0 };
383 
384  /*
385  If no glob characters exist, ensure no subimage specifier.
386  */
387  GetPathComponent(pattern,SubimagePath,path);
388  if (*path != '\0')
389  return(MagickFalse);
390  }
391  while (GetUTFCode(pattern) != 0)
392  {
393  int
394  ecode = GetUTFCode(expression),
395  pcode = GetUTFCode(pattern);
396 
397  if ((ecode == 0) && (pcode != '*') && (pcode != '{'))
398  break;
399  switch (pcode)
400  {
401  case '*':
402  {
403  do
404  {
405  /*
406  Skip consecutive '*'.
407  */
408  pattern+=GetUTFOctets(pattern);
409  }
410  while (GetUTFCode(pattern) == '*');
411  while (1)
412  {
413  /*
414  Try to match at each position.
415  */
416  if (GlobExpression_(expression,pattern,case_insensitive,depth+1) != MagickFalse)
417  {
418  /*
419  Consume rest of expression and pattern.
420  */
421  while (GetUTFCode(expression) != 0)
422  expression+=GetUTFOctets(expression);
423  while (GetUTFCode(pattern) != 0)
424  pattern+=GetUTFOctets(pattern);
425  return(MagickTrue);
426  }
427  if (GetUTFCode(expression) == 0)
428  break;
429  expression+=GetUTFOctets(expression);
430  }
431  return(MagickFalse);
432  }
433  case '?':
434  {
435  if (ecode == 0)
436  return(MagickFalse);
437  pattern+=GetUTFOctets(pattern);
438  expression+=GetUTFOctets(expression);
439  break;
440  }
441  case '[':
442  {
443  const char
444  *p = pattern+GetUTFOctets(pattern),
445  *q = pattern+GetUTFOctets(pattern);
446 
447  MagickBooleanType
448  matched = MagickFalse;
449 
450  if (ecode == 0)
451  return(MagickFalse);
452  while ((GetUTFCode(q) != 0) && (GetUTFCode(q) != ']'))
453  q+=GetUTFOctets(q);
454  if (GetUTFCode(q) == 0)
455  return(MagickFalse); /* malformed */
456  while (p < q)
457  {
458  const char
459  *next;
460 
461  int
462  code = GetUTFCode(p);
463 
464  size_t
465  octets = GetUTFOctets(p);
466 
467  if (code == '\\')
468  {
469  p+=octets;
470  code=GetUTFCode(p);
471  octets=GetUTFOctets(p);
472  }
473  next=p+octets;
474  if ((next < q) && (GetUTFCode(next) == '-'))
475  {
476  int
477  ncode;
478 
479  next+=GetUTFOctets(next);
480  ncode=GetUTFCode(next);
481  if (ncode == '\\')
482  {
483  next+=GetUTFOctets(next);
484  ncode=GetUTFCode(next);
485  }
486  if ((ecode >= code) && (ecode <= ncode))
487  matched=MagickTrue;
488  p=next+GetUTFOctets(next);
489  }
490  else
491  {
492  if (ecode == code)
493  matched=MagickTrue;
494  p+=octets;
495  }
496  }
497  /*
498  Skip consecutive '*'.
499  */
500  if (matched == MagickFalse)
501  return(MagickFalse);
502  pattern=q+GetUTFOctets(q); /* skip ']' */
503  expression+=GetUTFOctets(expression);
504  break;
505  }
506  case '{':
507  {
508  char
509  *a,
510  *alternative;
511 
512  const char
513  *p,
514  *q;
515 
516  size_t
517  remaining = MagickPathExtent;
518 
519  pattern+=GetUTFOctets(pattern); /* Skip '{' */
520  if (GetUTFCode(pattern) == 0)
521  return(MagickFalse);
522  /*
523  End of brace expression: append remaining pattern.
524  */
525  p=pattern;
526  while ((GetUTFCode(p) != 0) && (GetUTFCode(p) != '}'))
527  {
528 #if !defined(MAGICKCORE_WINDOWS_SUPPORT) || defined(__CYGWIN__)
529  if (GetUTFCode(p) == '\\')
530  {
531  p+=GetUTFOctets(p);
532  if (GetUTFCode(p) == 0)
533  break;
534  }
535 #endif
536  p+=GetUTFOctets(p);
537  }
538  if (GetUTFCode(p) != '}')
539  return(MagickFalse); /* malformed */
540  q=p+GetUTFOctets(p);
541  alternative=AcquireString(pattern);
542  a=alternative;
543  while (1)
544  {
545  int
546  code = GetUTFCode(pattern);
547 
548  size_t
549  octets;
550 
551  if ((code == 0) || (code == ',') || (code == '}'))
552  {
553  char
554  *subpattern;
555 
556  MagickBooleanType
557  match;
558 
559  /*
560  Try alternative as a full sub-pattern.
561  */
562  *a='\0';
563  subpattern=AcquireString(alternative);
564  if (ConcatenateString(&subpattern,q) == MagickFalse)
565  {
566  subpattern=DestroyString(subpattern);
567  alternative=DestroyString(alternative);
568  return(MagickFalse);
569  }
570  match=GlobExpression_(expression,subpattern,case_insensitive,
571  depth+1);
572  subpattern=DestroyString(subpattern);
573  if (match != MagickFalse)
574  {
575  /*
576  Consume rest of expression and pattern.
577  */
578  while (GetUTFCode(expression) != 0)
579  expression+=GetUTFOctets(expression);
580  pattern=q;
581  while (GetUTFCode(pattern) != 0)
582  pattern+=GetUTFOctets(pattern);
583  alternative=DestroyString(alternative);
584  return(MagickTrue);
585  }
586  /*
587  Reset buffer for next alternative.
588  */
589  a=alternative;
590  remaining=MagickPathExtent;
591  if (code == ',')
592  {
593  pattern+=GetUTFOctets(pattern); /* skip ',' */
594  continue;
595  }
596  break; /* '}' or end */
597  }
598  /*
599  Copy UTF-8 sequence into alternative.
600  */
601  octets=GetUTFOctets(pattern);
602  if ((octets == 0) || (octets >= remaining))
603  break;
604  (void) memcpy(a,pattern,octets);
605  a+=octets;
606  remaining-=octets;
607  pattern+=octets;
608  }
609  alternative=DestroyString(alternative);
610  return(MagickFalse);
611  }
612 #if !defined(MAGICKCORE_WINDOWS_SUPPORT) || defined(__CYGWIN__)
613  case '\\':
614  {
615  pattern+=GetUTFOctets(pattern);
616  if (GetUTFCode(pattern) == 0)
617  return(MagickFalse);
618  magick_fallthrough;
619  }
620 #endif
621  default:
622  {
623  int
624  ec = ecode,
625  pc = pcode;
626 
627  if (ecode == 0)
628  return(MagickFalse);
629  if (case_insensitive != MagickFalse)
630  {
631  pc=LocaleToLowercase(pc);
632  ec=LocaleToLowercase(ec);
633  }
634  if (pc != ec)
635  return(MagickFalse);
636  pattern+=GetUTFOctets(pattern);
637  expression+=GetUTFOctets(expression);
638  break;
639  }
640  }
641  }
642  while (GetUTFCode(pattern) == '*')
643  pattern+=GetUTFOctets(pattern);
644  return(((GetUTFCode(expression) == 0) &&
645  (GetUTFCode(pattern) == 0)) ? MagickTrue : MagickFalse);
646 }
647 
648 MagickExport MagickBooleanType GlobExpression(
649  const char *magick_restrict expression,const char *magick_restrict pattern,
650  const MagickBooleanType case_insensitive)
651 {
652  return(GlobExpression_(expression,pattern,case_insensitive,0));
653 }
654 
655 /*
656 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
657 % %
658 % %
659 % %
660 + I s G l o b %
661 % %
662 % %
663 % %
664 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
665 %
666 % IsGlob() returns MagickTrue if the path specification contains a globbing
667 % pattern.
668 %
669 % The format of the IsGlob method is:
670 %
671 % MagickBooleanType IsGlob(const char *path)
672 %
673 % A description of each parameter follows:
674 %
675 % o path: the path.
676 %
677 */
678 MagickExport MagickBooleanType IsGlob(const char *path)
679 {
680  MagickBooleanType
681  status = MagickFalse;
682 
683  const char
684  *p;
685 
686  if (IsPathAccessible(path) != MagickFalse)
687  return(MagickFalse);
688  for (p=path; *p != '\0'; p++)
689  {
690  switch (*p)
691  {
692  case '*':
693  case '?':
694  case '{':
695  case '}':
696  case '[':
697  case ']':
698  {
699  status=MagickTrue;
700  break;
701  }
702  default:
703  break;
704  }
705  }
706  return(status);
707 }
708 
709 /*
710 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
711 % %
712 % %
713 % %
714 % I s M a g i c k T r u e %
715 % %
716 % %
717 % %
718 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
719 %
720 % IsMagickTrue() returns MagickTrue if the value is "true", "on", "yes" or
721 % "1".
722 %
723 % The format of the IsMagickTrue method is:
724 %
725 % MagickBooleanType IsMagickTrue(const char *value)
726 %
727 % A description of each parameter follows:
728 %
729 % o option: either MagickTrue or MagickFalse depending on the value
730 % parameter.
731 %
732 % o value: Specifies a pointer to a character array.
733 %
734 */
735 MagickExport MagickBooleanType IsMagickTrue(const char *value)
736 {
737  if (value == (const char *) NULL)
738  return(MagickFalse);
739  if (LocaleCompare(value,"true") == 0)
740  return(MagickTrue);
741  if (LocaleCompare(value,"on") == 0)
742  return(MagickTrue);
743  if (LocaleCompare(value,"yes") == 0)
744  return(MagickTrue);
745  if (LocaleCompare(value,"1") == 0)
746  return(MagickTrue);
747  return(MagickFalse);
748 }
749 
750 /*
751 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
752 % %
753 % %
754 % %
755 % T o k e n i z e r %
756 % %
757 % %
758 % %
759 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
760 %
761 % Tokenizer() is a generalized, finite state token parser. It extracts tokens
762 % one at a time from a string of characters. The characters used for white
763 % space, for break characters, and for quotes can be specified. Also,
764 % characters in the string can be preceded by a specifiable escape character
765 % which removes any special meaning the character may have.
766 %
767 % Here is some terminology:
768 %
769 % o token: A single unit of information in the form of a group of
770 % characters.
771 %
772 % o white space: Space that gets ignored (except within quotes or when
773 % escaped), like blanks and tabs. in addition, white space terminates a
774 % non-quoted token.
775 %
776 % o break set: One or more characters that separates non-quoted tokens.
777 % Commas are a common break character. The usage of break characters to
778 % signal the end of a token is the same as that of white space, except
779 % multiple break characters with nothing or only white space between
780 % generate a null token for each two break characters together.
781 %
782 % For example, if blank is set to be the white space and comma is set to
783 % be the break character, the line
784 %
785 % A, B, C , , DEF
786 %
787 % ... consists of 5 tokens:
788 %
789 % 1) "A"
790 % 2) "B"
791 % 3) "C"
792 % 4) "" (the null string)
793 % 5) "DEF"
794 %
795 % o Quote character: A character that, when surrounding a group of other
796 % characters, causes the group of characters to be treated as a single
797 % token, no matter how many white spaces or break characters exist in
798 % the group. Also, a token always terminates after the closing quote.
799 % For example, if ' is the quote character, blank is white space, and
800 % comma is the break character, the following string
801 %
802 % A, ' B, CD'EF GHI
803 %
804 % ... consists of 4 tokens:
805 %
806 % 1) "A"
807 % 2) " B, CD" (note the blanks & comma)
808 % 3) "EF"
809 % 4) "GHI"
810 %
811 % The quote characters themselves do not appear in the resultant
812 % tokens. The double quotes are delimiters i use here for
813 % documentation purposes only.
814 %
815 % o Escape character: A character which itself is ignored but which
816 % causes the next character to be used as is. ^ and \ are often used
817 % as escape characters. An escape in the last position of the string
818 % gets treated as a "normal" (i.e., non-quote, non-white, non-break,
819 % and non-escape) character. For example, assume white space, break
820 % character, and quote are the same as in the above examples, and
821 % further, assume that ^ is the escape character. Then, in the string
822 %
823 % ABC, ' DEF ^' GH' I ^ J K^ L ^
824 %
825 % ... there are 7 tokens:
826 %
827 % 1) "ABC"
828 % 2) " DEF ' GH"
829 % 3) "I"
830 % 4) " " (a lone blank)
831 % 5) "J"
832 % 6) "K L"
833 % 7) "^" (passed as is at end of line)
834 %
835 % The format of the Tokenizer method is:
836 %
837 % int Tokenizer(TokenInfo *token_info,const unsigned flag,char *token,
838 % const size_t max_token_length,const char *line,const char *white,
839 % const char *break_set,const char *quote,const char escape,
840 % char *breaker,int *next,char *quoted)
841 %
842 % A description of each parameter follows:
843 %
844 % o flag: right now, only the low order 2 bits are used.
845 %
846 % 1 => convert non-quoted tokens to upper case
847 % 2 => convert non-quoted tokens to lower case
848 % 0 => do not convert non-quoted tokens
849 %
850 % o token: a character string containing the returned next token
851 %
852 % o max_token_length: the maximum size of "token". Characters beyond
853 % "max_token_length" are truncated.
854 %
855 % o line: the string to be parsed.
856 %
857 % o white: a string of the valid white spaces. example:
858 %
859 % char whitesp[]={" \t"};
860 %
861 % blank and tab will be valid white space.
862 %
863 % o break_set: a string of the valid break characters. example:
864 %
865 % char breakch[]={";,"};
866 %
867 % semicolon and comma will be valid break characters.
868 %
869 % o quote: a string of the valid quote characters. An example would be
870 %
871 % char quotech[]={"'\""};
872 %
873 % (this causes single and double quotes to be valid) Note that a
874 % token starting with one of these characters needs the same quote
875 % character to terminate it.
876 %
877 % for example:
878 %
879 % "ABC '
880 %
881 % is unterminated, but
882 %
883 % "DEF" and 'GHI'
884 %
885 % are properly terminated. Note that different quote characters
886 % can appear on the same line; only for a given token do the quote
887 % characters have to be the same.
888 %
889 % o escape: the escape character (NOT a string ... only one
890 % allowed). Use zero if none is desired.
891 %
892 % o breaker: the break character used to terminate the current
893 % token. If the token was quoted, this will be the quote used. If
894 % the token is the last one on the line, this will be zero.
895 %
896 % o next: this variable points to the first character of the
897 % next token. it gets reset by "tokenizer" as it steps through the
898 % string. Set it to 0 upon initialization, and leave it alone
899 % after that. You can change it if you want to jump around in the
900 % string or re-parse from the beginning, but be careful.
901 %
902 % o quoted: set to True if the token was quoted and MagickFalse
903 % if not. You may need this information (for example: in C, a
904 % string with quotes around it is a character string, while one
905 % without is an identifier).
906 %
907 % o result: 0 if we haven't reached EOS (end of string), and 1
908 % if we have.
909 %
910 */
911 
/*
  Tokenizer() scanner states, stored in TokenInfo.state.
*/
#define IN_WHITE  0  /* before the token: skipping white space */
#define IN_TOKEN  1  /* inside an unquoted token */
#define IN_QUOTE  2  /* inside a quoted token */
#define IN_OZONE  3  /* token complete: skipping white space after it */
916 
/*
  Return the index of the first occurrence of character c in string, or -1
  if it does not occur.  The terminating NUL is never matched.
*/
static ssize_t sindex(int c,const char *string)
{
  ssize_t
    position;

  for (position=0; string[position] != '\0'; position++)
    if ((int) string[position] == c)
      return(position);
  return(-1);
}
927 
928 static void StoreToken(TokenInfo *token_info,char *string,
929  size_t max_token_length,int c)
930 {
931  ssize_t
932  i;
933 
934  if ((token_info->offset < 0) ||
935  ((size_t) token_info->offset >= (max_token_length-1)))
936  return;
937  i=token_info->offset++;
938  string[i]=(char) c;
939  if (token_info->state == IN_QUOTE)
940  return;
941  switch (token_info->flag & 0x03)
942  {
943  case 1:
944  {
945  string[i]=(char) LocaleToUppercase(c);
946  break;
947  }
948  case 2:
949  {
950  string[i]=(char) LocaleToLowercase(c);
951  break;
952  }
953  default:
954  break;
955  }
956 }
957 
958 MagickExport int Tokenizer(TokenInfo *token_info,const unsigned flag,
959  char *token,const size_t max_token_length,const char *line,const char *white,
960  const char *break_set,const char *quote,const char escape,char *breaker,
961  int *next,char *quoted)
962 {
963  int
964  c;
965 
966  ssize_t
967  i;
968 
969  *breaker='\0';
970  *quoted='\0';
971  if (line[*next] == '\0')
972  return(1);
973  token_info->state=IN_WHITE;
974  token_info->quote=(char) MagickFalse;
975  token_info->flag=flag;
976  for (token_info->offset=0; (int) line[*next] != 0; (*next)++)
977  {
978  c=(int) line[*next];
979  i=sindex(c,break_set);
980  if (i >= 0)
981  {
982  switch (token_info->state)
983  {
984  case IN_WHITE:
985  case IN_TOKEN:
986  case IN_OZONE:
987  {
988  (*next)++;
989  *breaker=break_set[i];
990  token[token_info->offset]='\0';
991  return(0);
992  }
993  case IN_QUOTE:
994  {
995  StoreToken(token_info,token,max_token_length,c);
996  break;
997  }
998  }
999  continue;
1000  }
1001  i=sindex(c,quote);
1002  if (i >= 0)
1003  {
1004  switch (token_info->state)
1005  {
1006  case IN_WHITE:
1007  {
1008  token_info->state=IN_QUOTE;
1009  token_info->quote=quote[i];
1010  *quoted=(char) MagickTrue;
1011  break;
1012  }
1013  case IN_QUOTE:
1014  {
1015  if (quote[i] != token_info->quote)
1016  StoreToken(token_info,token,max_token_length,c);
1017  else
1018  {
1019  token_info->state=IN_OZONE;
1020  token_info->quote='\0';
1021  }
1022  break;
1023  }
1024  case IN_TOKEN:
1025  case IN_OZONE:
1026  {
1027  *breaker=(char) c;
1028  token[token_info->offset]='\0';
1029  return(0);
1030  }
1031  }
1032  continue;
1033  }
1034  i=sindex(c,white);
1035  if (i >= 0)
1036  {
1037  switch (token_info->state)
1038  {
1039  case IN_WHITE:
1040  case IN_OZONE:
1041  break;
1042  case IN_TOKEN:
1043  {
1044  token_info->state=IN_OZONE;
1045  break;
1046  }
1047  case IN_QUOTE:
1048  {
1049  StoreToken(token_info,token,max_token_length,c);
1050  break;
1051  }
1052  }
1053  continue;
1054  }
1055  if (c == (int) escape)
1056  {
1057  if (line[(*next)+1] == '\0')
1058  {
1059  *breaker='\0';
1060  StoreToken(token_info,token,max_token_length,c);
1061  (*next)++;
1062  token[token_info->offset]='\0';
1063  return(0);
1064  }
1065  switch (token_info->state)
1066  {
1067  case IN_WHITE:
1068  {
1069  (*next)--;
1070  token_info->state=IN_TOKEN;
1071  break;
1072  }
1073  case IN_TOKEN:
1074  case IN_QUOTE:
1075  {
1076  (*next)++;
1077  c=(int) line[*next];
1078  StoreToken(token_info,token,max_token_length,c);
1079  break;
1080  }
1081  case IN_OZONE:
1082  {
1083  token[token_info->offset]='\0';
1084  return(0);
1085  }
1086  }
1087  continue;
1088  }
1089  switch (token_info->state)
1090  {
1091  case IN_WHITE:
1092  {
1093  token_info->state=IN_TOKEN;
1094  StoreToken(token_info,token,max_token_length,c);
1095  break;
1096  }
1097  case IN_TOKEN:
1098  case IN_QUOTE:
1099  {
1100  StoreToken(token_info,token,max_token_length,c);
1101  break;
1102  }
1103  case IN_OZONE:
1104  {
1105  token[token_info->offset]='\0';
1106  return(0);
1107  }
1108  }
1109  }
1110  token[token_info->offset]='\0';
1111  return(0);
1112 }