/* The Unicode utility.
   Written by Pieter J. Schoenmakers <tiggr@ics.ele.tue.nl>

   Copyright (C) 1996 Pieter J. Schoenmakers.

   This file is part of TOM.  TOM is distributed under the terms of the
   TOM License, a copy of which can be found in the TOM distribution; see
   the file LICENSE.

   $Id: uc.t,v 1.19 1998/01/06 00:34:42 tiggr Exp $  */

/* Class-side state of the `uc' utility: the action selected on the
   command line and the constants naming the possible actions.  */
implementation class
uc: stdio, Constants
{
  <doc> The kind of thing we're supposed to do.  </doc>
  /* Holds one of the CREATE_* values below once an action has been
     chosen; `main' and `set_to_do' treat zero as "no action chosen
     yet".  */
  int to_do;

  <doc> The kind of thing we can do.  </doc>
  /* Create a predicate bitset.  */
  const CREATE_BITSET = 1;
  /* Create a map from 8-bit ISO to Unicode.  */
  const CREATE_MAP = 2;
  /* Create a case conversion table.  */
  const CREATE_CONVERSION = 3;
}

/* Print the command line synopsis (program name, options, actions and
   a note on 8-bit versus 16-bit output) to `err', followed by a
   newline, and then ask the runtime to exit with status 1.  */
void
  usage
{
  [[err print ("usage: ", [Runtime program_name], " options action
options:
  -o file   output to the <file>
  -m name   <name> the 8-bit to unicode mapping file to read (when necessary)
  -u name   <name> the unicode data file to read (when necessary)
action: one of:
  predicate (bitset): digit letter numeric punctuation space isupper islower
  ISO to Unicode map: map
  case conversion table: upper lower title
note:
  depending on whether the `-m' option is present, the output is either
  for unicode use (i.e. 16-bit), or for use within the encoding of the
  mapping (i.e. 8-bit). ")] nl];

  /* NOTE(review): the callers in this file continue as if this call
     never returns (e.g. `nextArgument' indexes past the end otherwise),
     so `exit' is presumably terminal -- confirm.  */
  [Runtime exit 1];
}

/* Return the argument that follows index I in ARGUMENTS, paired with
   the index of that argument so the caller can resume scanning after
   it.  When no argument follows I, print the usage message instead
   (which exits the program).  */
(String, int)
  nextArgument (Array, int) (arguments, i)
{
  /* `>=' rather than `==' so an index already past the end is rejected
     too instead of running off the array.  */
  if (i >= [arguments length] - 1)
    [self usage];

  /* Advance first and build the result afterwards, so the tuple does
     not depend on the order in which its elements are evaluated.  */
  i++;

  = (arguments[i], i);
}

/* Record A as the action to perform.  Only a single action may be
   given on the command line: if one was recorded earlier, the usage
   message is printed first.  */
void
  set_to_do int a
{
  boolean already_chosen = to_do != 0;

  if (already_chosen)
    [self usage];

  to_do = a;
}

/* Program entry point.

   Parses ARGUMENTS (one action word plus the options -o, -u and -m),
   reads the Unicode data file and/or the 8-bit mapping file that the
   action needs, and writes the generated table to the -o file, or to
   `out' when no -o option was given:

     CREATE_MAP         512 bytes: for each of the 256 byte values, the
                        mapped 16-bit character, high byte first.
     CREATE_BITSET      one bit per character for which the predicate
                        holds: 32 bytes with -m, 8192 bytes without.
     CREATE_CONVERSION  with -m, 256 bytes giving the other-case byte of
                        each byte; without -m, text lines "base other
                        count" describing runs of consecutive characters
                        whose other-case characters are also consecutive.

   Returns 0.  A malformed command line ends in `usage' (exit status 1);
   a missing -u or -m option for an action that needs it exits with
   status 2.  */
int
  main Array arguments
{
  int i, line, n = [arguments length];
  selector predicate, other_case;
  String uc_name, iso_name, out_name;
  MutableObjectArray chars;
  MutableCharArray map;
  MutableByteString t;
  UnicodeCharacter ch;
  OutputStream f_out;
  InputStream f;

  /* Command line: exactly one action word, options in any position.  */
  for (i = 0; i < n; i++)
    {
      ByteString a = arguments[i];

      if (["map" equal a])
        [self set_to_do CREATE_MAP];
      else if (["letter" equal a])
        {
          predicate = selector ("o_isLetter");
          [self set_to_do CREATE_BITSET];
        }
      else if (["digit" equal a])
        {
          predicate = selector ("o_isDigit");
          [self set_to_do CREATE_BITSET];
        }
      else if (["numeric" equal a])
        {
          predicate = selector ("o_isNumeric");
          [self set_to_do CREATE_BITSET];
        }
      else if (["punctuation" equal a])
        {
          predicate = selector ("o_isPunctuation");
          [self set_to_do CREATE_BITSET];
        }
      else if (["space" equal a])
        {
          predicate = selector ("o_isSpace");
          [self set_to_do CREATE_BITSET];
        }
      else if (["isupper" equal a])
        {
          predicate = selector ("o_isUpper");
          [self set_to_do CREATE_BITSET];
        }
      else if (["islower" equal a])
        {
          predicate = selector ("o_isLower");
          [self set_to_do CREATE_BITSET];
        }
      else if (["upper" equal a])
        {
          other_case = selector ("c_upper");
          [self set_to_do CREATE_CONVERSION];
        }
      else if (["lower" equal a])
        {
          other_case = selector ("c_lower");
          [self set_to_do CREATE_CONVERSION];
        }
      else if (["title" equal a])
        {
          other_case = selector ("c_title");
          [self set_to_do CREATE_CONVERSION];
        }
      else if (["-o" equal a])
        (out_name, i) = [self nextArgument (arguments, i)];
      else if (["-u" equal a])
        (uc_name, i) = [self nextArgument (arguments, i)];
      else if (["-m" equal a])
        (iso_name, i) = [self nextArgument (arguments, i)];
      else
        [self usage];
    }

  if (!to_do)
    [self usage];

  /* Validate the options before the output file is opened, so that a
     faulty invocation does not truncate an existing file.  The map
     action is built from the mapping file only; every other action
     needs the Unicode data file.  */
  if (to_do == CREATE_MAP)
    {
      if (!iso_name)
        {
          [[err print "-m needed for this action"] nl];
          [Runtime exit 2];
        }
    }
  else if (!uc_name)
    {
      [[err print "-u needed for this action"] nl];
      [Runtime exit 2];
    }

  if (out_name)
    f_out = [File open out_name output: YES
                  flags: FILE_EXIST_TRUNCATE | FILE_NOT_EXIST_CREATE];
  else
    f_out = out;

  /* Read the Unicode data file into CHARS, indexed by code point.  */
  if (to_do != CREATE_MAP)
    {
      f = [File open uc_name input: YES];
      f = [BufferedStream with f];

      chars = [MutableObjectArray withCapacity 65536];
      [chars resize [chars capacity]];

      line = 0;
      do
        {
          t = [f readLine];
          line++;

          if (!!t && [t length] != 0)
            {
              ch = [UnicodeCharacter with t];

              if (ch == nil)
                [[err print (uc_name, ":", line, ": bad line:", t)] nl];
              else
                chars[[ch unicode]] = ch;
            }
        } while (t != nil);

      [f close];
      f = nil;
    }

  /* Read the 8-bit to Unicode mapping file into MAP, indexed by byte.  */
  if (iso_name != nil)
    {
      f = [File open iso_name input: YES];
      f = [BufferedStream with f];

      map = [MutableCharArray withCapacity 256];
      [map resize [map capacity]];

      line = 0;
      do
        {
          t = [f readLine];
          line++;

          /* Very dumb parser: tab separated fields, the first two being
             the 8-bit code and the Unicode value; lines starting with
             `#' are skipped.  */
          if (!!t && !![t length] && t[0] != '#')
            {
              Array fields = [t componentsSeparatedBy '\t' limit: 3];
              int iso, uni;
              boolean b, bad;

              /* Only look at the fields when both are present; a line
                 without a tab has no fields[1] to index.  */
              bad = [fields length] < 2;

              if (!bad)
                {
                  (iso, b, ) = [fields[0] integerValue (0, -1)
                                          allowCBases: YES];
                  if (!b)
                    bad = YES;

                  (uni, b, ) = [fields[1] integerValue (0, -1)
                                          allowCBases: YES];
                  if (!b)
                    bad = YES;

                  if (iso < 0 || iso > 0xff || uni < 0 || uni > 0xffff)
                    bad = YES;
                }

              if (bad)
                [[err print (iso_name, ":", line, ": bad line: ", t)] nl];
              else
                map[iso] = char (uni);
            }
        } while (t != nil);

      [f close];
      f = nil;

      /* Fill any unnamed characters with themselves.  */
      for (i = 0; i < 256; i++)
        if (map[i] == char (0))
          map[i] = char (i);
    }

  if (to_do == CREATE_MAP)
    {
      MutableByteArray byte_map = [MutableByteArray withCapacity 512];

      [byte_map resize [byte_map capacity]];
      for (i = 0; i < 256; i++)
        {
          char uni = map[i];

          /* High byte first.  */
          byte_map[2 * i] = byte (uni / 256);
          byte_map[2 * i + 1] = byte (uni % 256);
        }

      [f_out writeBytes byte_map];
    }
  else if (to_do == CREATE_BITSET)
    {
      MutableByteArray set = [MutableByteArray new];

      if (map != nil)
        {
          /* 8-bit output: bit I describes the character byte I maps to.  */
          [set resize 32];

          for (i = 0; i < 256; i++)
            {
              char uni = map[i];

              ch = chars [uni];
              if (ch != nil && [ch perform predicate])
                {
                  byte b = set[i / 8];
                  set[i / 8] = b | byte (1 << (i % 8));
                }
            }
        }
      else
        {
          /* 16-bit output: one bit for each of the 65536 characters.  */
          [set resize 8192];

          for (i = 0; i < 65536; i++)
            {
              ch = chars [i];

              if (ch != nil && [ch perform predicate])
                {
                  char u = [ch unicode];
                  byte b = set[u / 8];
                  set[u / 8] = b | byte (1 << (u % 8));
                }
            }
        }

      [f_out writeBytes set];
    }
  else if (to_do == CREATE_CONVERSION)
    {
      if (map != nil)
        {
          MutableByteArray bytes = [MutableByteArray new];

          /* Give the table its full length before it is indexed, as is
             done for every other array in this method.  */
          [bytes resize 256];

          for (i = 0; i < 256; i++)
            {
              char uni = map[i];
              /* A byte without a counterpart in this encoding converts
                 to itself.  */
              byte b = byte (i);
              int j;

              ch = chars[uni];
              char counterpart = !ch ? char (0) : [ch perform other_case];

              if (counterpart != char (0))
                {
                  // This can be done faster...
                  for (j = 0; j < 256; j++)
                    {
                      if (map[j] == counterpart)
                        {
                          b = byte (j);
                          break;
                        }
                    }
                }

              bytes[i] = b;
            }

          [f_out writeBytes bytes];
        }
      else
        {
          /* BASE is the first character of the run being collected
             (zero when there is none), OTHER the other-case character
             of BASE, NUM the length of the run so far, and PREVIOUS the
             character handled in the preceding iteration.  */
          char c, o, previous, base, other;
          int num;

          c = char (0);
          o = char (0);
          previous = char (0);
          base = char (0);
          other = char (0);
          num = 0;

          for (i = 0; i < 65536; i++)
            {
              ch = chars[i];

              if (!ch)
                o = char (0);
              else
                {
                  c = [ch unicode];
                  o = [ch perform other_case];
                }

              /* The run ends when this character has no other case, or
                 when it or its other case does not directly follow the
                 preceding entry of the run.  */
              if ((!o && base != 0)
                  || o != 0 && (c != previous + char (1)
                                || o != other + char (num)))
                {
                  if (base != 0)
                    {
                      [[f_out print (int (base), ' ', int (other), ' ',
                                     num)] nl];
                      base = char (0);
                    }
                }

              if (o != 0)
                {
                  if (!base)
                    {
                      base = c;
                      other = o;
                      num = 1;
                    }
                  else
                    num++;
                }

              previous = c;
            }

          /* Write out a run that extends to the very last character;
             nothing inside the loop would end it.  */
          if (base != 0)
            [[f_out print (int (base), ' ', int (other), ' ', num)] nl];
        }
    }

  [f_out close];

  = 0;
}

end;

/* Instance side of `uc'.  It defines a single method with an empty
   body.  */
implementation instance uc

// For the moment, you do not want to know why this is here.
// Tue Jan  6 00:47:23 1998, tiggr@tnt.ics.ele.tue.nl
/* NOTE(review): placeholder taking two ints (S, L) with nothing in its
   body; no code in the visible source calls it, and the author's
   comment above deliberately leaves its purpose unexplained -- confirm
   with the build before removing or changing it.  */
All
  substring (int, int) (s, l)
{
}

end;
