001    /*--------------------------------------------------------------------------+
002    $Id: EByteOrderMark.java 29722 2010-08-16 13:40:26Z deissenb $
003    |                                                                          |
004    | Copyright 2005-2010 Technische Universitaet Muenchen                     |
005    |                                                                          |
006    | Licensed under the Apache License, Version 2.0 (the "License");          |
007    | you may not use this file except in compliance with the License.         |
008    | You may obtain a copy of the License at                                  |
009    |                                                                          |
010    |    http://www.apache.org/licenses/LICENSE-2.0                            |
011    |                                                                          |
012    | Unless required by applicable law or agreed to in writing, software      |
013    | distributed under the License is distributed on an "AS IS" BASIS,        |
014    | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
015    | See the License for the specific language governing permissions and      |
016    | limitations under the License.                                           |
017    +--------------------------------------------------------------------------*/
018    package edu.tum.cs.commons.filesystem;
019    
020    import java.util.Arrays;
021    
022    import edu.tum.cs.commons.assertion.CCSMAssert;
023    import edu.tum.cs.commons.collections.ArrayUtils;
024    
025    /**
026     * Enumeration of the UTF byte order marks (BOM). The actual values are taken
027     * from http://unicode.org/faq/utf_bom.html
028     * <p>
029     * The order of the values in this enum is chosen such that BOMs that are a
030     * prefix of other BOMs are at the end, i.e. UTF-32 is before UTF-16. This way
031     * we can check the BOM prefix in the order of the enum values' appearance.
032     * 
033     * @author hummelb
034     * @author $Author: deissenb $
035     * @version $Rev: 29722 $
036     * @levd.rating GREEN Hash: 2AAB6CBCE60BACE98E4803B711962593
037     */
038    public enum EByteOrderMark {
039    
040            /** UTF-32 with big endian encoding. */
041            UTF_32BE("UTF-32BE", new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }),
042    
043            /** UTF-32 with little endian encoding. */
044            UTF_32LE("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }),
045    
046            /** UTF-16 with big endian encoding. */
047            UTF_16BE("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF }),
048    
049            /** UTF-16 with little endian encoding. */
050            UTF_16LE("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE }),
051    
052            /**
053             * UTF-8. Note that for UTF-8 the endianess is not relevant and that the BOM
054             * is optional.
055             */
056            UTF_8_BOM("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
057    
058            /** The maximal length of a BOM. */
059            public static final int MAX_BOM_LENGTH = 4;
060    
061            /** The name of the encoding */
062            private final String encoding;
063    
064            /** The byte order mark. */
065            private final byte[] bom;
066    
067            /** Constructor. */
068            private EByteOrderMark(String encoding, byte[] bom) {
069                    this.encoding = encoding;
070                    CCSMAssert.isTrue(bom.length <= MAX_BOM_LENGTH,
071                                    "Inconsistent max BOM length!");
072                    this.bom = bom;
073            }
074    
075            /** Returns the encoding. */
076            public String getEncoding() {
077                    return encoding;
078            }
079    
080            /**
081             * Returns the byte order mark. This returns a copy, so the array may be
082             * modified.
083             */
084            public byte[] getBOM() {
085                    return Arrays.copyOf(bom, bom.length);
086            }
087    
088            /** Returns the size of the BOM in bytes. */
089            public int getBOMLength() {
090                    return bom.length;
091            }
092    
093            /**
094             * This method checks the start of the provided data array to find a BOM. If
095             * a BOM is found, the corresponding enum value is returned. Otherwise,
096             * <code>null</code> is returned. If possible, the provided data should at
097             * least be of size {@value #MAX_BOM_LENGTH}. Otherwise the encoding might
098             * not be detected correctly. However, the method also works with shorter
099             * arrays (e.g. if a file consists of only 3 bytes).
100             */
101            public static EByteOrderMark determineBOM(byte[] data) {
102                    for (EByteOrderMark bom : values()) {
103                            if (ArrayUtils.isPrefix(bom.bom, data)) {
104                                    return bom;
105                            }
106                    }
107                    return null;
108            }
109    }