【C#】ファイルの文字コードを取得・判別する

ファイルの文字コードを判別する方法を紹介します。

符号化形式（符号化スキーム）	エンディアンの区別	バイト順マーク (BOM)
UTF-8		0xEF 0xBB 0xBF（なおBOM無しはUTF-8Nと呼ばれることがある）
UTF-16	BE	0xFE 0xFF
UTF-16	LE	0xFF 0xFE
UTF-16BE		（付加は認められない）
UTF-16LE		（付加は認められない）
UTF-32	BE	0x00 0x00 0xFE 0xFF
UTF-32	LE	0xFF 0xFE 0x00 0x00
UTF-32BE		（付加は認められない）
UTF-32LE		（付加は認められない）
UTF-7		0x2B 0x2F 0x76 ※ （※は次のバイトの値によって異なり、0x38、0x39、0x2B、0x2Fのいずれかがくる）

BOMについては下記を参考にしています。

バイト順マーク - Wikipedia

ファイル先頭のBOMを読み取って文字コードの判別を行います。

/// <summary>
/// Detects the text encoding of a file by inspecting the byte order mark (BOM)
/// at the start of the file.
/// </summary>
/// <param name="filename">Path of the file to inspect. The file is opened read-only.</param>
/// <returns>
/// The encoding that matches the BOM (UTF-7, UTF-8, UTF-16 LE/BE or UTF-32 LE/BE),
/// or <see cref="Encoding.ASCII"/> when no known BOM is found.
/// </returns>
public static Encoding GetEncoding(string filename)
{
    // Read up to four bytes: the longest BOM recognized here is four bytes long.
    var bom = new byte[4];
    int length = 0;
    using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full or the end of the file is reached.
        int read;
        while (length < bom.Length && (read = file.Read(bom, length, bom.Length - length)) > 0)
        {
            length += read;
        }
    }

    // The number of bytes actually read is checked before each comparison so that
    // the zero padding of a short file is never mistaken for part of a BOM.
    if (length >= 4)
    {
        // UTF-32LE must be tested before UTF-16LE: its BOM (FF FE 00 00) starts
        // with the UTF-16LE BOM (FF FE) and would otherwise never be reached.
        if (bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0x00 && bom[3] == 0x00)
        {
            return Encoding.UTF32; // UTF-32LE
        }

        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xfe && bom[3] == 0xff)
        {
            return new UTF32Encoding(true, true); // UTF-32BE
        }
    }

    if (length >= 3)
    {
        if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76)
        {
            return Encoding.UTF7; // UTF-7
        }

        if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
        {
            return Encoding.UTF8; // UTF-8
        }
    }

    if (length >= 2)
    {
        if (bom[0] == 0xff && bom[1] == 0xfe)
        {
            return Encoding.Unicode; // UTF-16LE
        }

        if (bom[0] == 0xfe && bom[1] == 0xff)
        {
            return Encoding.BigEndianUnicode; // UTF-16BE
        }
    }

    // No recognizable BOM: fall back to ASCII.
    return Encoding.ASCII;
}

/// <summary>
/// Detects the text encoding of a file by inspecting the byte order mark (BOM)
/// at the start of the file.
/// </summary>
/// <param name="filename">Path of the file to inspect. The file is opened read-only.</param>
/// <returns>
/// The encoding that matches the BOM (UTF-7, UTF-8, UTF-16 LE/BE or UTF-32 LE/BE),
/// or <see cref="Encoding.ASCII"/> when no known BOM is found.
/// </returns>
public static Encoding GetEncoding(string filename)
{
    // Read up to four bytes: the longest BOM recognized here is four bytes long.
    var bom = new byte[4];
    int length = 0;
    using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full or the end of the file is reached.
        int read;
        while (length < bom.Length && (read = file.Read(bom, length, bom.Length - length)) > 0)
        {
            length += read;
        }
    }

    // The number of bytes actually read is checked before each comparison so that
    // the zero padding of a short file is never mistaken for part of a BOM.
    if (length >= 4)
    {
        // UTF-32LE must be tested before UTF-16LE: its BOM (FF FE 00 00) starts
        // with the UTF-16LE BOM (FF FE) and would otherwise never be reached.
        if (bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0x00 && bom[3] == 0x00)
        {
            return Encoding.UTF32; // UTF-32LE
        }

        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xfe && bom[3] == 0xff)
        {
            return new UTF32Encoding(true, true); // UTF-32BE
        }
    }

    if (length >= 3)
    {
        if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76)
        {
            return Encoding.UTF7; // UTF-7
        }

        if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
        {
            return Encoding.UTF8; // UTF-8
        }
    }

    if (length >= 2)
    {
        if (bom[0] == 0xff && bom[1] == 0xfe)
        {
            return Encoding.Unicode; // UTF-16LE
        }

        if (bom[0] == 0xfe && bom[1] == 0xff)
        {
            return Encoding.BigEndianUnicode; // UTF-16BE
        }
    }

    // No recognizable BOM: fall back to ASCII.
    return Encoding.ASCII;
}

ファイルの文字コードを判別してEncodingオブジェクトを取得するには以下の様に使用します。

// Detect the encoding of the file at the given path from its leading BOM.
var enc = GetEncoding(@"C:\Foo\bar.txt");

var enc = GetEncoding(@"C:\Foo\bar.txt");