/*--------------------------------------------------------------------------+
$Id: EByteOrderMark.java 29722 2010-08-16 13:40:26Z deissenb $
|                                                                          |
| Copyright 2005-2010 Technische Universitaet Muenchen                     |
|                                                                          |
| Licensed under the Apache License, Version 2.0 (the "License");          |
| you may not use this file except in compliance with the License.         |
| You may obtain a copy of the License at                                  |
|                                                                          |
|     http://www.apache.org/licenses/LICENSE-2.0                           |
|                                                                          |
| Unless required by applicable law or agreed to in writing, software      |
| distributed under the License is distributed on an "AS IS" BASIS,        |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and      |
| limitations under the License.                                           |
+--------------------------------------------------------------------------*/
package edu.tum.cs.commons.filesystem;

import java.util.Arrays;

import edu.tum.cs.commons.assertion.CCSMAssert;
import edu.tum.cs.commons.collections.ArrayUtils;

/**
 * The byte order marks (BOMs) of the UTF encodings. The byte values follow
 * http://unicode.org/faq/utf_bom.html
 * <p>
 * The declaration order of the constants is significant: whenever one BOM is
 * a prefix of another one, the longer BOM is declared first (UTF-32 precedes
 * UTF-16). {@link #determineBOM(byte[])} tests the constants in declaration
 * order and returns the first hit, so the longer BOM wins.
 * <p>
 * A consequence of this order is that data starting with
 * <code>FF FE 00 00</code> is always reported as UTF-32LE, although the same
 * bytes could also be a UTF-16LE BOM followed by the character U+0000.
 * 
 * @author hummelb
 * @author $Author: deissenb $
 * @version $Rev: 29722 $
 * @levd.rating GREEN Hash: 2AAB6CBCE60BACE98E4803B711962593
 */
public enum EByteOrderMark {

	/** UTF-32, big endian. */
	UTF_32BE("UTF-32BE", new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }),

	/** UTF-32, little endian. */
	UTF_32LE("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }),

	/** UTF-16, big endian. */
	UTF_16BE("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF }),

	/** UTF-16, little endian. */
	UTF_16LE("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE }),

	/**
	 * UTF-8. Endianness plays no role for UTF-8, and the BOM itself is
	 * optional for this encoding.
	 */
	UTF_8_BOM("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });

	/** Upper bound (in bytes) for the length of any BOM in this enum. */
	public static final int MAX_BOM_LENGTH = 4;

	/** Name of the encoding this BOM announces. */
	private final String encodingName;

	/** The raw bytes of the BOM. Never handed out directly, only as a copy. */
	private final byte[] bomBytes;

	/**
	 * Creates a constant and asserts that its BOM fits into
	 * {@link #MAX_BOM_LENGTH}.
	 * <p>
	 * Enum constructors are implicitly private. Reading
	 * {@link #MAX_BOM_LENGTH} from here is only permitted because that field
	 * is a compile-time constant.
	 */
	EByteOrderMark(String encodingName, byte[] bomBytes) {
		CCSMAssert.isTrue(bomBytes.length <= MAX_BOM_LENGTH,
				"Inconsistent max BOM length!");
		this.encodingName = encodingName;
		this.bomBytes = bomBytes;
	}

	/** Returns the name of the encoding. */
	public String getEncoding() {
		return encodingName;
	}

	/**
	 * Returns the bytes of the BOM. Each call creates a fresh copy, so the
	 * caller is free to modify the returned array.
	 */
	public byte[] getBOM() {
		byte[] copy = Arrays.copyOf(bomBytes, bomBytes.length);
		return copy;
	}

	/** Returns the number of bytes the BOM consists of. */
	public int getBOMLength() {
		return bomBytes.length;
	}

	/**
	 * Inspects the beginning of the given data for a BOM. The constants are
	 * tried in declaration order and the first one whose BOM is a prefix of
	 * the data is returned; if none matches, the result is <code>null</code>.
	 * <p>
	 * Callers should pass at least {@value #MAX_BOM_LENGTH} bytes whenever
	 * that many are available, as the encoding might not be detected
	 * correctly otherwise. Shorter arrays are accepted nevertheless (e.g. for
	 * a file that consists of only 3 bytes).
	 */
	public static EByteOrderMark determineBOM(byte[] data) {
		EByteOrderMark[] candidates = values();
		for (int i = 0; i < candidates.length; ++i) {
			EByteOrderMark candidate = candidates[i];
			if (ArrayUtils.isPrefix(candidate.bomBytes, data)) {
				return candidate;
			}
		}
		return null;
	}
}