超簡単なZIPアーカイバ

圧縮率とか速度とかを度外視して、どの程度の行数でZIPのアーカイバを作成できるかどうか検証してみた。

ZIPの仕様書(英語)を流し読みしながら実装。分からないところは、手元で作った複数のZIPファイルをバイナリエディタで眺めながら解析。

正直、version made byは表があるものの、数字が16進数なのか、10進数なのかわかりにくいし、実際のzipファイルには違う値が入っているし、いったい、これは何なのか分からなかった。

internal file attributeは、何のために存在するのか分かりません。crc-32も、元のファイルに対するCRCであることとかは明示してほしい。あと、似たような名前の構造が多い上に、説明が分散しすぎて読みにくい。

機種依存部分なんて、他の実装を見るしかないだろうという感じ。external file attributeにGetFileAttributesの結果をそのままぶち込んで良いのかなんて、何となくそんな気もするけど、explzhの出力を見るまで確信が持てなかった。

あとはねぇ、ファイル名をUTF-8で持つことが出来るような拡張が最近の仕様で行われているのだけど、Mac OS Xとか、Linuxでのアーカイバでは当然使われていないし、Windowsでもどのアーカイバなら対応しているのかはかなり不明。explzhは対応しているみたいというか、Mac OS Xで作ったZIPのUTF-8をgeneral purpose bit flagなしで正しくデコードできている。これはひょっとして、リソースフォーク系のファイルを検出した時点でUTF-8に決めうちしているのだろうか?

とか、いろいろ解析したりしているけど、現実的には、#ziplibがLGPLで利用できるので、自分で作る必要はないのだけども、外部ライブラリを一切利用せずにC#でZIPを作ってみたかったのが理由。

残念なのは、.NET Frameworkには、CRC32の計算ルーチンが含まれていないので、それだけは外部からもらってくる必要があったこと。ライセンス、リファレンス用のコメントをのぞけば、実質、400行程度。さらに、CRC32のクラスを除けば、320行ぐらい。まぁ、ファイルの相対パスチェックや、親ディレクトリの走査などの実質的なヘルパーを除けば本体はきわめて小さい。これだけのコードでZIPが出来るという意味ではおもしろいと思う。

ただ、圧縮率は、さすがにその辺の現実的なアーカイバよりは低め。こればっかりは、.NET FrameworkのDeflateStreamでは限界があると言うことか。DeflateStreamはパラメータ類の調整が一切出来ないのがつらい。

とはいえ、これだけのコードでZIPが作れるというのはある意味うれしい。Visual Studio 2008で最適化をかけてコードを出力したら、たった11KBだった。この程度ならば、Silverlightアプリとかでも利用できるねぇ。

使い方は、

> SimpleZipArc 出力先.zip ディレクトリ

みたいな感じ。第二引数以降はすべて圧縮するファイル群と見なされる。

一応、BSDライセンスですが、現実的なところでは好きにしてください。単に他人に渡すときにはBSDってことにしてねという程度。

using System;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Collections.Generic;

namespace SimpleZipArc
{
 /// <summary>
 /// Simple ZIP Archiver.
 /// </summary>
 /// <remarks>
 /// Entries are written to the output file as they are added; the central
 /// directory is written when the archiver is disposed. ZIP64 is not
 /// implemented, so entry counts above 65535 and sizes/offsets of 4 GB or
 /// more are rejected instead of being silently truncated.
 /// </remarks>
 public sealed class ZipArc : IDisposable
 {
  /// <summary>
  /// Sample Routine: the first argument is the output ZIP file, all
  /// following arguments are files/directories to archive.
  /// </summary>
  public static void Main(string[] args)
  {
   if (args == null || args.Length == 0)
   {
    Console.Error.WriteLine("Usage: SimpleZipArc output.zip [file-or-directory ...]");
    return;
   }

   using (ZipArc za = new ZipArc(args[0]))
   {
    za.BasePath = Directory.GetCurrentDirectory();
    for (int i = 1; i < args.Length; i++)
     za.AddFile(args[i]);
   }
  }

  /// <summary>
  /// Initializes a ZIP archiver with output zip file name.
  /// </summary>
  /// <param name="zipFileName">Path of the output ZIP file.</param>
  public ZipArc(string zipFileName)
  {
   // Resolved up front so the output file can be recognized (and skipped)
   // when it lies inside a directory that is being archived.
   m_zipPath = Path.GetFullPath(zipFileName);

   // Sensible default so AddFile works even if BasePath is never assigned.
   m_basePath = Directory.GetCurrentDirectory();

   m_fs = File.Open(zipFileName, FileMode.Create, FileAccess.ReadWrite);
  }

  /// <summary>
  /// Base directory path, to which archived file names are made relative.
  /// Defaults to the current directory at construction time.
  /// </summary>
  public string BasePath
  {
   get { return m_basePath; }
   set
   {
    m_basePath = Path.GetFullPath(value);
   }
  }

  /// <summary>
  /// Add a file/directory to the archive. A directory is added together
  /// with everything below it; entries that are already in the archive
  /// are skipped.
  /// </summary>
  /// <param name="fileName">Path to a file/directory to archive.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="fileName"/> is null.
  /// </exception>
  /// <exception cref="ArgumentException">
  /// The file/directory is outside the <see cref="ZipArc.BasePath"/>.
  /// </exception>
  /// <exception cref="FileNotFoundException">
  /// The file/directory does not exist.
  /// </exception>
  /// <exception cref="NotSupportedException">
  /// The archive would need ZIP64 (too many entries or 4 GB or more).
  /// </exception>
  /// <exception cref="ObjectDisposedException">
  /// The archiver has already been disposed.
  /// </exception>
  public void AddFile(string fileName)
  {
   if (fileName == null)
    throw new ArgumentNullException("fileName");
   if (m_fs == null)
    throw new ObjectDisposedException("ZipArc");

   string fullPath = Path.GetFullPath(fileName);
   AddSingleEntry(fullPath);

   if (!Directory.Exists(fullPath))
    return;

   // AllDirectories already yields the complete subtree, so the children
   // are added one by one without recursing again. The directory is walked
   // even if its own entry already existed (e.g. created implicitly as the
   // parent of a file that was added earlier).
   foreach (string dir in Directory.GetDirectories(fullPath, "*", SearchOption.AllDirectories))
    AddSingleEntry(dir);

   foreach (string file in Directory.GetFiles(fullPath, "*", SearchOption.AllDirectories))
    AddSingleEntry(file);
  }

  /// <summary>
  /// Finalizes archiving: writes the central directory and closes the
  /// output file. Calling it more than once is harmless.
  /// </summary>
  public void Dispose()
  {
   if (m_fs == null)
    return;

   try
   {
    WriteCentralDirectory();
   }
   finally
   {
    // Release the file handle even if the central directory could not be written.
    m_fs.Dispose();
    m_fs = null;
   }
   GC.SuppressFinalize(this);
  }

  /// <summary>
  /// Comparison used for file system paths: case-insensitive where the
  /// directory separator is a backslash, ordinal elsewhere.
  /// </summary>
  private static StringComparison PathComparison
  {
   get
   {
    return Path.DirectorySeparatorChar == '\\'
     ? StringComparison.OrdinalIgnoreCase
     : StringComparison.Ordinal;
   }
  }

  private static string TrimTrailingSeparators(string path)
  {
   return path.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
  }

  /// <summary>
  /// Adds exactly one file or directory entry (no recursion).
  /// </summary>
  private void AddSingleEntry(string fullPath)
  {
   // Never try to store the archive inside itself.
   if (string.Equals(fullPath, m_zipPath, PathComparison))
    return;

   if (!File.Exists(fullPath) && !Directory.Exists(fullPath))
    throw new FileNotFoundException("The file or directory to archive does not exist.", fullPath);

   AddFileInternal(fullPath, GenerateInternalFileName(fullPath));
  }

  /// <summary>
  /// Writes the local header (and data) for one entry unless an entry of
  /// the same name exists.
  /// </summary>
  /// <returns>false if the name was already archived; otherwise true.</returns>
  private bool AddFileInternal(string fileName, string iname)
  {
   // An empty name denotes the base directory itself, which has no entry.
   if (string.IsNullOrEmpty(iname))
    return true;

   // duplication check
   if (m_fileSet.Contains(iname))
    return false;

   // The end-of-central-directory record stores the count in 16 bits and
   // offsets in 32 bits; fail loudly rather than write a corrupt archive.
   if (m_entries.Count >= ushort.MaxValue)
    throw new NotSupportedException("More than 65535 entries require ZIP64, which is not supported.");

   long start = m_fs.Position;
   if (start > uint.MaxValue)
    throw new NotSupportedException("Archives of 4GB or larger require ZIP64, which is not supported.");

   FileEntry fe = new FileEntry(fileName, iname);
   try
   {
    fe.WriteLocalFileHeader(m_fs);
   }
   catch
   {
    // Drop the partially written entry so the archive stays consistent.
    m_fs.Position = start;
    m_fs.SetLength(start);
    throw;
   }

   m_entries.Add(fe);
   m_fileSet.Add(iname);
   return true;
  }

  /// <summary>
  /// Converts a path into the '/'-separated name stored in the archive
  /// (relative to <see cref="BasePath"/>) and makes sure entries exist for
  /// all of its parent directories.
  /// </summary>
  /// <returns>
  /// The internal name (directories end with '/'), or an empty string if
  /// the path is the base directory itself.
  /// </returns>
  private string GenerateInternalFileName(string fileName)
  {
   // Trailing separators are removed on both sides so that "dir" and
   // "dir\" (or a root such as "C:\") are handled uniformly.
   string full = TrimTrailingSeparators(Path.GetFullPath(fileName));
   string basePath = TrimTrailingSeparators(m_basePath);

   if (string.Equals(full, basePath, PathComparison))
    return string.Empty;

   // Require a separator right after the base path; otherwise "C:\foobar"
   // would be accepted as being inside "C:\foo".
   string prefix = basePath + Path.DirectorySeparatorChar;
   if (!full.StartsWith(prefix, PathComparison))
    throw new ArgumentException("The file is not in the base directory!", "fileName");

   // make the file name relative to the base path
   string iname = full.Substring(prefix.Length).Replace(Path.DirectorySeparatorChar, '/');

   // confirms all the ascending directories exist
   string[] comps = iname.Split('/');
   string dir = string.Empty;
   string dirPath = basePath;
   for (int i = 0; i < comps.Length - 1; i++)
   {
    dir += comps[i] + "/";
    dirPath = dirPath + Path.DirectorySeparatorChar + comps[i];
    if (!m_fileSet.Contains(dir))
     AddFileInternal(dirPath, dir);
   }

   // directory should be suffixed by /
   if (Directory.Exists(fileName))
    iname += "/";

   return iname;
  }

  /// <summary>
  /// Writes the central directory headers of all entries followed by the
  /// end-of-central-directory record.
  /// </summary>
  private void WriteCentralDirectory()
  {
   long offset = m_fs.Position;
   foreach (FileEntry fe in m_entries)
    fe.WriteCentralDirectoryStructureEntry(m_fs);
   long size = m_fs.Position - offset;

   m_fs.WriteUInt32(0x06054b50U);               // end of central directory signature
   m_fs.WriteUInt16(0);                         // number of this disk
   m_fs.WriteUInt16(0);                         // disk where the central directory starts
   m_fs.WriteUInt16((ushort)m_entries.Count);   // entries on this disk
   m_fs.WriteUInt16((ushort)m_entries.Count);   // entries in total
   m_fs.WriteUInt32((uint)size);                // size of the central directory
   m_fs.WriteUInt32((uint)offset);              // offset of the central directory
   m_fs.WriteUInt16(0);                         // archive comment length
  }

  private Stream m_fs;
  private string m_zipPath;
  private string m_basePath;
  private List<FileEntry> m_entries = new List<FileEntry>();
  private HashSet<string> m_fileSet = new HashSet<string>();

  /// <summary>
  /// This class manages a file/directory.
  /// </summary>
  private sealed class FileEntry
  {
   /// <summary>
   /// Initializes an entry for a file.
   /// </summary>
   /// <param name="fileName">Path to a file/directory to the archive.</param>
   /// <param name="nameToStore">File path, which is stored in the archive.</param>
   /// <exception cref="FileNotFoundException">The path does not exist.</exception>
   /// <exception cref="NotSupportedException">The file is 4 GB or larger.</exception>
   public FileEntry(string fileName, string nameToStore)
   {
    m_fi = new FileInfo(fileName);
    m_name = nameToStore;

    // FileInfo.Attributes yields -1 (all bits set, including Directory)
    // for a missing path, so existence has to be verified explicitly.
    if (!m_fi.Exists && !Directory.Exists(fileName))
     throw new FileNotFoundException("The file or directory to archive does not exist.", fileName);

    if (!IsDirectory)
    {
     // 0xFFFFFFFF is reserved as the ZIP64 marker in the size fields.
     if (m_fi.Length >= uint.MaxValue)
      throw new NotSupportedException("Files of 4GB or larger require ZIP64, which is not supported.");

     m_uncompressedSize = m_fi.Length;
     m_compressedSize = m_fi.Length;
    }

    // Decided once so the local and the central header always agree.
    m_compress = IsToCompress;
   }

   /// <summary>
   /// Write local file header (followed by the file data).
   /// </summary>
   /// <param name="s">Stream to write on.</param>
   public void WriteLocalFileHeader(Stream s)
   {
    WriteEntry(s, true);
   }

   /// <summary>
   /// Write file header on central directory structure.
   /// </summary>
   /// <param name="s">Stream to write on.</param>
   public void WriteCentralDirectoryStructureEntry(Stream s)
   {
    WriteEntry(s, false);
   }

   /// <summary>
   /// Write local file header or file header.
   /// </summary>
   /// <param name="s">Stream to write on; must be seekable.</param>
   /// <param name="forLocalFileHeader">To write local file header,
   /// this should be true; otherwise false.</param>
   private void WriteEntry(Stream s, bool forLocalFileHeader)
   {
    if (forLocalFileHeader)
    {
     if (m_bodyWritten)
      throw new ApplicationException("File is already archived.");
    }
    else
    {
     if (!m_bodyWritten)
      throw new ApplicationException("File is not archived!");
    }

    System.Diagnostics.Trace.WriteLine(m_name);

    if (forLocalFileHeader)
     m_offset = s.Position;

    WriteHeader(s, forLocalFileHeader);

    if (!forLocalFileHeader)
     return;

    if (!IsDirectory)
     WriteBody(s);

    m_bodyWritten = true;
   }

   /// <summary>
   /// Writes the fields shared by the local and the central header, plus
   /// the fields that exist only in the central header.
   /// </summary>
   private void WriteHeader(Stream s, bool forLocalFileHeader)
   {
    s.WriteUInt32(forLocalFileHeader ? 0x04034B50U : 0x02014B50U);
    if (!forLocalFileHeader)
     s.WriteUInt16(0x2D); // version made by: host 0 in the high byte, 45 (spec 4.5) in the low byte
    s.WriteUInt16(IsDirectory ? 0xAU : 0x14U); // req version to extract
    s.WriteUInt16(UseUTF8 ? (1U << 11) : 0U); // Unicode flag (names are in UTF-8)
    s.WriteUInt16(m_compress ? 8U : 0U); // Deflate or not
    s.WriteUInt16(DosTime);
    s.WriteUInt16(DosDate);
    s.WriteUInt32(m_crc32); // CRC32 @(offset + 14); patched after the data is written
    s.WriteUInt32((uint)m_compressedSize);
    s.WriteUInt32((uint)m_uncompressedSize);

    byte[] nameBin = StringToBytes(m_name);
    byte[] extra = new byte[0]; // empty on this code
    s.WriteUInt16((uint)nameBin.Length);
    s.WriteUInt16((uint)extra.Length);

    if (!forLocalFileHeader)
    {
     s.WriteUInt16(0); // file comment length
     s.WriteUInt16(0); // disk no.
     s.WriteUInt16(0); // internal file attribute
     s.WriteUInt32((uint)m_fi.Attributes); // external file attribute
     s.WriteUInt32((uint)m_offset);
    }

    s.Write(nameBin, 0, nameBin.Length);
    s.Write(extra, 0, extra.Length);
   }

   /// <summary>
   /// Copies the file content into the archive (deflated or stored) and
   /// then patches CRC-32 and both sizes into the local header.
   /// </summary>
   private void WriteBody(Stream s)
   {
    long dataPos = s.Position;
    long total = 0;
    Crc32 crc32 = new Crc32();

    // leaveOpen = true: disposing the deflater must not close the archive.
    Stream deflater = m_compress ? new DeflateStream(s, CompressionMode.Compress, true) : null;
    try
    {
     Stream target = deflater ?? s;
     byte[] buf = new byte[1024 * 1024];
     using (Stream fs = m_fi.OpenRead())
     {
      int len;
      while ((len = fs.Read(buf, 0, buf.Length)) > 0)
      {
       target.Write(buf, 0, len);
       crc32.Update(buf, 0, len);
       total += len;
      }
     }
    }
    finally
    {
     // Disposing flushes the remaining compressed data into the archive.
     if (deflater != null)
      deflater.Dispose();
    }

    long lastPos = s.Position;
    m_crc32 = crc32.Value;
    m_compressedSize = lastPos - dataPos;
    // Use what was really read, in case the file changed after its size was sampled.
    m_uncompressedSize = total;

    // write CRC32, the compressed size and the uncompressed size
    s.Position = m_offset + 14;
    s.WriteUInt32(m_crc32);
    s.WriteUInt32((uint)m_compressedSize);
    s.WriteUInt32((uint)m_uncompressedSize);
    s.Position = lastPos;
   }

   private FileInfo m_fi;
   private string m_name;
   private long m_offset = 0;
   private uint m_crc32 = 0;
   private long m_compressedSize = 0;
   private long m_uncompressedSize = 0;
   private bool m_compress = false;
   private bool m_bodyWritten = false;

   /// <summary>
   /// Whether the entry is a directory.
   /// </summary>
   public bool IsDirectory
   {
    get { return (m_fi.Attributes & FileAttributes.Directory) != 0; }
   }

   /// <summary>
   /// Last write time limited to the range an MS-DOS date can express
   /// (7-bit year counted from 1980).
   /// </summary>
   private DateTime DosTimestamp
   {
    get
    {
     DateTime dt = m_fi.LastWriteTime;
     if (dt.Year < 1980)
      return new DateTime(1980, 1, 1, 0, 0, 0);
     if (dt.Year > 2107)
      return new DateTime(2107, 12, 31, 23, 59, 58);
     return dt;
    }
   }

   /// <summary>
   /// Last write date in MS-DOS format (year-1980:7, month:4, day:5 bits).
   /// </summary>
   public ushort DosDate
   {
    get
    {
     DateTime dt = DosTimestamp;
     return (ushort)(((dt.Year - 1980) << 9) | (dt.Month << 5) | dt.Day);
    }
   }

   /// <summary>
   /// Last write time in MS-DOS format (hour:5, minute:6, second/2:5 bits).
   /// </summary>
   public ushort DosTime
   {
    get
    {
     DateTime dt = DosTimestamp;
     return (ushort)((dt.Hour << 11) | (dt.Minute << 5) | (dt.Second >> 1));
    }
   }

   /// <summary>
   /// Whether or not to compress the file.
   /// </summary>
   /// <remarks>
   /// Decision is made regarding the extension, file size and other
   /// file properties.</remarks>
   private bool IsToCompress
   {
    get
    {
     if (IsDirectory)
      return false;

     // The file is too small to compress; compression may
     // make the file larger.
     if (m_fi.Length < 256)
      return false;

     // Some file types are natively compressed and deflate cannot
     // compress them efficiently; no compression is a good choice
     // for such files. Invariant lower-casing keeps the match
     // independent of the current culture.
     string ext = Path.GetExtension(m_name).ToLowerInvariant();
     switch (ext)
     {
      case ".jpg":
      case ".zip":
      case ".docx":
      case ".xlsx":
      case ".pptx":
       return false;
      default:
       return true;
     }
    }
   }

   /// <summary>
   /// Encodes an entry name as UTF-8 or, if <see cref="UseUTF8"/> is
   /// false, with the system default encoding.
   /// </summary>
   private byte[] StringToBytes(string str)
   {
    if (UseUTF8)
     return Encoding.UTF8.GetBytes(str);
    return Encoding.Default.GetBytes(str); // use the locale setting rather than UTF-8
   }

   /// <summary>
   /// Prefers UTF-8 for file name encoding.
   /// </summary>
   public bool UseUTF8 = true;
  }
 }

 /// <summary>
 /// BinaryWriter like extension methods for Stream class.
 /// </summary>
 /// <remarks>
 /// Multi-byte values are always emitted least significant byte first,
 /// independent of the byte order of the machine running the code.
 /// </remarks>
 public static class BinaryWriterStreamHelper
 {
  /// <summary>
  /// Writes the whole byte array to the stream.
  /// </summary>
  /// <param name="s">Stream to write on.</param>
  /// <param name="data">Bytes to write; may be empty.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="s"/> or <paramref name="data"/> is null.
  /// </exception>
  public static void Write(this Stream s, byte[] data)
  {
   if (s == null)
    throw new ArgumentNullException("s");
   if (data == null)
    throw new ArgumentNullException("data");

   s.Write(data, 0, data.Length);
  }

  /// <summary>
  /// Writes a 32-bit unsigned integer in little-endian byte order.
  /// </summary>
  /// <param name="s">Stream to write on.</param>
  /// <param name="v">Value to write.</param>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  public static void WriteUInt32(this Stream s, uint v)
  {
   if (s == null)
    throw new ArgumentNullException("s");

   byte[] buf = new byte[4];
   buf[0] = (byte)(v & 0xFF);
   buf[1] = (byte)((v >> 8) & 0xFF);
   buf[2] = (byte)((v >> 16) & 0xFF);
   buf[3] = (byte)((v >> 24) & 0xFF);
   s.Write(buf, 0, buf.Length);
  }

  /// <summary>
  /// Writes the low 16 bits of <paramref name="v"/> in little-endian byte
  /// order; the upper 16 bits are discarded.
  /// </summary>
  /// <param name="s">Stream to write on.</param>
  /// <param name="v">Value whose low 16 bits are written.</param>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  public static void WriteUInt16(this Stream s, uint v)
  {
   if (s == null)
    throw new ArgumentNullException("s");

   byte[] buf = new byte[2];
   buf[0] = (byte)(v & 0xFF);
   buf[1] = (byte)((v >> 8) & 0xFF);
   s.Write(buf, 0, buf.Length);
  }
 }

 /// <summary>
 /// CRC32 class modified from Mike Krueger's code.
 /// </summary>
 /// <remarks>
 /// Table-driven, byte-at-a-time checksum. The running value starts as
 /// all ones and is complemented when read through <see cref="Value"/>.
 /// </remarks>
 public sealed class Crc32
 {
  /// <summary>
  /// Creates a checksum in its initial (no data) state.
  /// </summary>
  public Crc32()
  {
   Reset();
  }

  /// <summary>
  /// Returns the CRC32 data checksum computed so far.
  /// </summary>
  public uint Value
  {
   get { return ~m_crc; }
  }

  /// <summary>
  /// Resets the CRC32 data checksum as if no update was ever called.
  /// </summary>
  public void Reset()
  {
   m_crc = ~0U;
  }

  /// <summary>
  /// Adds the byte array to the data checksum.
  /// </summary>
  /// <param name = "buf">
  /// the buffer which contains the data.
  /// </param>
  /// <param name = "off">
  /// the offset in the buffer where the data starts.
  /// </param>
  /// <param name = "len">
  /// the length of the data.
  /// </param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="buf"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="off"/> or <paramref name="len"/> is negative, or the
  /// range they describe does not lie within <paramref name="buf"/>.
  /// </exception>
  public void Update(byte[] buf, int off, int len)
  {
   if (buf == null)
    throw new ArgumentNullException("buf");

   // "len > buf.Length - off" rather than "off + len > buf.Length":
   // the sum can overflow int and slip past the range check.
   if (off < 0 || len < 0 || len > buf.Length - off)
    throw new ArgumentOutOfRangeException();

   int end = off + len;
   for (int i = off; i < end; i++)
    m_crc = s_crcTable[(m_crc ^ buf[i]) & 0xFF] ^ (m_crc >> 8);
  }

  // Running (not yet complemented) checksum register.
  private uint m_crc;

  // 256-entry lookup table, indexed by the low byte of (register XOR input byte).
  private readonly static uint[] s_crcTable = new uint[]
  {
   0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419,
   0x706AF48F, 0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4,
   0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07,
   0x90BF1D91, 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
   0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 0x136C9856,
   0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
   0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4,
   0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
   0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3,
   0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 0x26D930AC, 0x51DE003A,
   0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599,
   0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
   0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190,
   0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F,
   0x9FBFE4A5, 0xE8B8D433, 0x7807C9A2, 0x0F00F934, 0x9609A88E,
   0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
   0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED,
   0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
   0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3,
   0xFBD44C65, 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
   0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A,
   0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5,
   0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, 0xBE0B1010,
   0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
   0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17,
   0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6,
   0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615,
   0x73DC1683, 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
   0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 0xF00F9344,
   0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
   0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A,
   0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
   0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1,
   0xA6BC5767, 0x3FB506DD, 0x48B2364B, 0xD80D2BDA, 0xAF0A1B4C,
   0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF,
   0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
   0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE,
   0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31,
   0x2CD99E8B, 0x5BDEAE1D, 0x9B64C2B0, 0xEC63F226, 0x756AA39C,
   0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
   0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B,
   0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
   0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1,
   0x18B74777, 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
   0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, 0xA00AE278,
   0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7,
   0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66,
   0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
   0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605,
   0xCDD70693, 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8,
   0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B,
   0x2D02EF8D
  };
 }
}