package org.simantics.databoard.util.binary;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.nio.charset.Charset;

/**
 * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
 * 
 * The differences between standard UTF8 and Modified are the following:
 * <ul>
 * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
 *     rather than 1-byte, so that the encoded strings never have
 *     embedded nulls.
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
 * <li>Supplementary characters (code points above U+FFFF) are represented
 *     in the form of surrogate pairs, each surrogate encoded in 3-byte format.
 * </ul>
 * 
 */
public class UTF8 {

    /** Charset used by {@link #writeUTF(DataOutput, String)} and {@link #readUTF(DataInput, int)}. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the Standard-UTF-8 encoding of a string.
     * The result equals <code>string.getBytes(CHARSET).length</code> without
     * allocating the byte array.
     *
     * @param string the string to measure, must not be null
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x7F) {
                // 1-byte format; in standard UTF-8 this includes the null char
                result += 1;
            } else if (c <= 0x7FF) {
                result += 2;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // A valid surrogate pair is one supplementary code point,
                // which takes 4 bytes. Consume the low surrogate as well.
                result += 4;
                i++;
            } else if (c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE) {
                // Unpaired surrogate: String.getBytes(Charset) substitutes the
                // single-byte replacement '?', so count one byte.
                result += 1;
            } else {
                result += 3;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified-UTF-8 encoding of a string.
     * The null char takes 2 bytes and every surrogate char takes 3 bytes.
     *
     * @param str the string to measure, must not be null
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;

        /* use charAt instead of copying String to char array */
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utflen++;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                // 0x0000 and 0x0080..0x07FF
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write the Modified-UTF8 encoding of a string to a stream. Only the
     * encoded bytes are written; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string, must not be null
     * @throws IOException if the output fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        // Adapted from DataOutputStream.writeUTF
        int strlen = str.length();
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                out.write(c);
            } else if (c > 0x07FF) {
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >>  6) & 0x3F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            } else {
                // 0x0000 and 0x0080..0x07FF use the 2-byte format
                out.write(0xC0 | ((c >>  6) & 0x1F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            }
        }
    }

    /**
     * Read Modified-UTF8 from a stream. Exactly <code>utflen</code> bytes are
     * consumed from the input and then decoded.
     *
     * @param in input
     * @param utflen number of bytes
     * @return string
     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8
     * @throws IOException if reading the input fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
    throws IOException, UTFDataFormatException
    {
        if (utflen == 0)
            return "";

        byte[] bytearr = new byte[utflen];
        in.readFully(bytearr, 0, utflen);
        // Decoding from offset 0 reports the same byte offsets in error
        // messages as a decoder working directly on the stream data.
        return readModifiedUTF(bytearr, 0, utflen);
    }

    /**
     * Decode Modified-UTF8 from a region of a byte array.
     *
     * @param bytearr input array of at least <code>toIndex</code> bytes
     * @param fromIndex starting offset of decoding
     * @param toIndex the first index excluded from decoding, &gt;= fromIndex
     * @return string
     * @throws UTFDataFormatException if the region is not valid Modified-UTF8;
     *         byte positions in the message are indices into <code>bytearr</code>
     */
    public static String readModifiedUTF(byte[] bytearr, int fromIndex, int toIndex)
    throws UTFDataFormatException
    {
        // Adapted from DataInputStream.readUTF
        int utflen = toIndex - fromIndex;

        // Optimization for empty strings
        if (utflen == 0)
            return "";

        // Every char needs at least one byte, so utflen chars always suffice
        char[] chararr = new char[utflen];
        int c, char2, char3;
        int count = fromIndex;
        int chararr_count = 0;

        // Fast path: copy the leading run of 1-byte characters
        while (count < toIndex) {
            c = (int) bytearr[count] & 0xff;
            if (c > 127) break;
            count++;
            chararr[chararr_count++] = (char) c;
        }

        // General path: dispatch on the high nibble of the lead byte
        while (count < toIndex) {
            c = (int) bytearr[count] & 0xff;
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx*/
                    count++;
                    chararr[chararr_count++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx*/
                    count += 2;
                    if (count > toIndex)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count-1];
                    if ((char2 & 0xC0) != 0x80)
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararr_count++] = (char)(((c & 0x1F) << 6) |
                                                      (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > toIndex)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count-2];
                    char3 = (int) bytearr[count-1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count-1));
                    chararr[chararr_count++] = (char)(((c     & 0x0F) << 12) |
                                                      ((char2 & 0x3F) << 6)  |
                                                      ((char3 & 0x3F) << 0));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararr_count);
    }

    /**
     * Write the Standard-UTF8 encoding of a string to a stream. Only the
     * encoded bytes are written; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string, must not be null
     * @throws IOException if the output fails
     */
    public static void writeUTF(DataOutput out, String str)
    throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read Standard-UTF8 from a stream.
     *
     * @param in input
     * @param len number of bytes
     * @return string
     * @throws IOException if reading the input fails
     */
    public static String readUTF(DataInput in, int len)
    throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}
