ドキュメントからテキストのみを抽出する

ちょっと思うところがあって、テキストマイニングに挑戦中。
まずは、対象となるテキストを集める必要があるのだが、多くのドキュメントはWordやExcel、PDFで保存されているので、これらのドキュメントからテキストを抽出する必要がある。


そこで、http://sqljunkies.com/HowTo/C4AC6E97-8D84-411D-8551-08CE63EC99B6.scuk
ここを参考に、IFilterを使用して、テキストを抽出するプログラムを書いてみた。

もとのプログラムを動かしてみたところ、ちょっとバグっぽいところがあったのでそれを修正したのと、
PDFやHTMLにも対応できるようにしてみた。

ちなみに、IFilterのGUIDは、
http://www.citeknet.com/Products/IFilters/IFilterExplorer/tabid/62/Default.aspx
このツールで調べることができる。

using System;
using System.Text;
using System.Runtime.InteropServices;
namespace StemText
{
    /// <summary>
    /// Bit flags accepted by the <c>grfFlags</c> parameter of <c>IFilter.Init</c>.
    /// Member names suggest they mirror the native IFILTER_INIT enumeration
    /// (assumption; confirm against filter.h). Members are listed in ascending bit order.
    /// </summary>
    [Flags]
    public enum IFILTER_INIT
    {
        NONE                   = 0,
        CANON_PARAGRAPHS       = 1 << 0,   // 1
        HARD_LINE_BREAKS       = 1 << 1,   // 2
        CANON_HYPHENS          = 1 << 2,   // 4
        CANON_SPACES           = 1 << 3,   // 8
        APPLY_INDEX_ATTRIBUTES = 1 << 4,   // 16
        APPLY_OTHER_ATTRIBUTES = 1 << 5,   // 32
        INDEXING_ONLY          = 1 << 6,   // 64
        SEARCH_LINKS           = 1 << 7,   // 128
        APPLY_CRAWL_ATTRIBUTES = 1 << 8,   // 256
        FILTER_OWNED_VALUE_OK  = 1 << 9    // 512
    }
    /// <summary>
    /// Flag set with a single member. Presumably the value written back through
    /// the <c>pdwFlags</c> parameter of <c>IFilter.Init</c> (assumption; the
    /// interface declares that parameter as a plain <c>uint</c>).
    /// </summary>
    [Flags]
    public enum IFILTER_FLAGS
    {
        OLE_PROPERTIES = 0x1
    }
    /// <summary>
    /// Type of the <c>breakType</c> field of <c>STAT_CHUNK</c>.
    /// Members are numbered sequentially from zero; the abbreviations presumably
    /// stand for end-of-word/sentence/paragraph/chapter (assumption from the names).
    /// </summary>
    public enum CHUNK_BREAKTYPE
    {
        CHUNK_NO_BREAK = 0,
        CHUNK_EOW,          // 1
        CHUNK_EOS,          // 2
        CHUNK_EOP,          // 3
        CHUNK_EOC           // 4
    }
    /// <summary>
    /// Bit flags stored in the <c>flags</c> field of <c>STAT_CHUNK</c>.
    /// <c>FilterUtil.GetText</c> looks for <see cref="CHUNK_TEXT"/> to decide
    /// whether a chunk's text should be read.
    /// </summary>
    [Flags]
    public enum CHUNKSTATE
    {
        CHUNK_TEXT               = 1 << 0,   // 0x1
        CHUNK_VALUE              = 1 << 1,   // 0x2
        CHUNK_FILTER_OWNED_VALUE = 1 << 2    // 0x4
    }
    /// <summary>
    /// Two-valued discriminator; presumably the meaning of <c>PROPSPEC.ulKind</c>
    /// (assumption from the names; that field is declared as a raw <c>uint</c>).
    /// </summary>
    public enum PSKIND
    {
        LPWSTR,   // 0
        PROPID    // 1
    }
      /// <summary>
      /// Managed mirror of the native PROPSPEC structure:
      /// <c>struct { ULONG ulKind; union { PROPID propid; LPOLESTR lpwstr; }; }</c>.
      /// </summary>
      /// <remarks>
      /// The native structure stores <c>propid</c> and <c>lpwstr</c> in the SAME
      /// memory (a union). Declaring them as two consecutive fields made the managed
      /// layout disagree with the native one (12 bytes instead of 8 on 32-bit; the
      /// property id at offset 4 instead of 8 on 64-bit), which in turn shifted every
      /// field that follows a PROPSPEC inside <c>FULLPROPSPEC</c>/<c>STAT_CHUNK</c>.
      /// The union is now a single pointer-sized slot, exposed through the two
      /// original member names as properties. A fixed <c>FieldOffset</c> cannot be
      /// used because the union's offset depends on the pointer size.
      /// </remarks>
      [StructLayout(LayoutKind.Sequential)]
      public struct PROPSPEC
      {
          /// <summary>Discriminator telling which union member is valid (see <c>PSKIND</c>).</summary>
          public uint ulKind;

          // Backing storage for the native union. Pointer-sized, so the runtime
          // inserts the same alignment padding after ulKind as the native compiler.
          private IntPtr unionValue;

          /// <summary>
          /// Numeric property id; occupies the low 32 bits of the union slot
          /// (Windows is little-endian, so these are the first four bytes).
          /// </summary>
          public uint propid
          {
              get { return unchecked((uint)unionValue.ToInt64()); }
              // Only the low 32 bits are meaningful; sign extension on 64-bit is harmless.
              set { unionValue = new IntPtr(unchecked((int)value)); }
          }

          /// <summary>Pointer to a wide-character property name; shares storage with <see cref="propid"/>.</summary>
          public IntPtr lpwstr
          {
              get { return unionValue; }
              set { unionValue = value; }
          }
      }
      /// <summary>
      /// Pairs a property-set GUID with a <see cref="PROPSPEC"/>.
      /// Used as the element type of the <c>aAttributes</c> array passed to
      /// <c>IFilter.Init</c> and as the <c>attribute</c> field of <c>STAT_CHUNK</c>.
      /// </summary>
      /// <remarks>
      /// Sequential layout: the field order below is the marshaled memory order
      /// and must not be changed.
      /// </remarks>
      [StructLayout(LayoutKind.Sequential)]
      public struct FULLPROPSPEC
      {
            // GUID of the property set the property belongs to (per the field name).
            public Guid guidPropSet;
            // The property within that set.
            public PROPSPEC psProperty;
      }
    /// <summary>
    /// Chunk descriptor filled in by <c>IFilter.GetChunk</c> (its <c>out</c> parameter).
    /// </summary>
    /// <remarks>
    /// Sequential layout: the field order below is the marshaled memory order
    /// and must not be changed. Both enum fields are marshaled as 4-byte unsigned
    /// integers via <c>UnmanagedType.U4</c>.
    /// </remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct STAT_CHUNK
    {
                                          // Chunk identifier (per the field name).
                                          public uint  idChunk;
        // Kind of break associated with the chunk; see CHUNK_BREAKTYPE.
        [MarshalAs(UnmanagedType.U4)]     public CHUNK_BREAKTYPE breakType;
        // Chunk content flags; FilterUtil.GetText checks this for CHUNK_TEXT.
        [MarshalAs(UnmanagedType.U4)]     public CHUNKSTATE flags;
                                          // Presumably the locale identifier of the chunk's text - TODO confirm.
                                          public uint locale;
        // Property the chunk belongs to, embedded by value.
        [MarshalAs(UnmanagedType.Struct)] public FULLPROPSPEC attribute;
                                          // NOTE(review): the three *Source fields presumably locate the text
                                          // a derived chunk came from (source chunk id, start offset and length
                                          // in wide characters) - confirm against the IFilter documentation.
                                          public uint idChunkSource;
                                          public uint cwcStartSource;
                                          public uint cwcLenSource;
    }
    /// <summary>
    /// Region descriptor passed by value as <c>origPos</c> to <c>IFilter.BindRegion</c>.
    /// </summary>
    /// <remarks>
    /// Sequential layout: the field order below is the marshaled memory order.
    /// </remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct FILTERREGION
    {
        // Chunk the region refers to (per the field name).
        public uint idChunk;
        // Presumably the region's start offset within the chunk, in wide characters - TODO confirm.
        public uint cwcStart;
        // Presumably the region's length, in wide characters - TODO confirm.
        public uint cwcExtent;
    }
    /// <summary>
    /// COM-import declaration of the IFilter interface
    /// (IID 89BCB740-6119-101A-BCB7-00DD010655AF), an IUnknown-based interface.
    /// </summary>
    /// <remarks>
    /// The order of the method declarations defines the COM vtable slot order and
    /// must not be changed. Methods marked <c>[PreserveSig]</c> return the raw
    /// HRESULT as an <c>int</c>; the others surface failure HRESULTs as exceptions.
    /// </remarks>
    [ComImport]
    [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
    [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IFilter
    {
        /// <summary>Initializes the filter; FilterUtil.GetText calls this once before reading chunks.</summary>
        /// <param name="grfFlags">Combination of IFILTER_INIT flags, marshaled as a 4-byte unsigned integer.</param>
        /// <param name="cAttributes">Number of elements in <paramref name="aAttributes"/> (it is the array's size parameter).</param>
        /// <param name="aAttributes">Attribute array; FilterUtil.GetText passes null with a count of 0.</param>
        /// <param name="pdwFlags">Passed by reference; presumably receives IFILTER_FLAGS on return - TODO confirm.</param>
        void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, 
                  uint cAttributes,
                  [MarshalAs(UnmanagedType.LPArray, SizeParamIndex=1)] FULLPROPSPEC[] aAttributes,
                  ref uint pdwFlags);
            // Earlier declaration without [PreserveSig], kept for reference; with it,
            // every non-success result would be thrown as an exception:
            //void GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);
        /// <summary>Advances to the next chunk and describes it in <paramref name="pStat"/>.</summary>
        /// <returns>Raw HRESULT; FilterUtil.GetText keeps iterating while it is 0.</returns>
        [PreserveSig] int GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);
            /// <summary>Reads text of the current chunk into <paramref name="buffer"/>.</summary>
            /// <param name="pcwcBuffer">In: buffer capacity in characters. Out: used by FilterUtil.GetText as the number of characters to take from the buffer.</param>
            /// <param name="buffer">Destination buffer, marshaled as a wide-character string.</param>
            /// <returns>Raw HRESULT; compared by callers against the FILTER_* codes in Constants.</returns>
            [PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);
        /// <summary>Not called anywhere in this file; the value pointer is exposed as a raw UIntPtr.</summary>
        void GetValue(ref UIntPtr ppPropValue);
        /// <summary>Not called anywhere in this file; the resulting interface pointer is exposed as a raw UIntPtr.</summary>
        void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
    }
      /// <summary>
      /// COM-import placeholder whose CLSID is f07f3920-7b8c-11cf-9be8-00aa004b9986.
      /// <c>FilterFactory</c> maps .doc, .xls and .ppt files to this class.
      /// Instantiating it asks COM to create the registered object with that CLSID.
      /// </summary>
      [ComImport, Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
      public class OfficeFilter { }

      /// <summary>
      /// COM-import placeholder whose CLSID is 4c904448-74a9-11d0-af6e-00c04fd8dc02.
      /// <c>FilterFactory</c> maps .pdf files to this class.
      /// Instantiating it asks COM to create the registered object with that CLSID.
      /// </summary>
      [ComImport, Guid("4c904448-74a9-11d0-af6e-00c04fd8dc02")]
      public class PDFFilter { }

      /// <summary>
      /// COM-import placeholder whose CLSID is faea5b46-761b-400e-b53e-e805a97a543e.
      /// <c>FilterFactory</c> maps .vsd files to this class.
      /// Instantiating it asks COM to create the registered object with that CLSID.
      /// </summary>
      [ComImport, Guid("faea5b46-761b-400e-b53e-e805a97a543e")]
      public class VisioFilter { }

      /// <summary>
      /// COM-import placeholder whose CLSID is e0ca5340-4534-11cf-b952-00aa0051fe20.
      /// <c>FilterFactory</c> maps .htm files to this class.
      /// Instantiating it asks COM to create the registered object with that CLSID.
      /// </summary>
      [ComImport, Guid("e0ca5340-4534-11cf-b952-00aa0051fe20")]
      public class HtmlFilter { }

      /// <summary>
      /// COM-import placeholder whose CLSID is c1243ca0-bf96-11cd-b579-08002b30bfeb.
      /// <c>FilterFactory</c> falls back to this class for extensions it has no mapping for.
      /// Instantiating it asks COM to create the registered object with that CLSID.
      /// </summary>
      [ComImport, Guid("c1243ca0-bf96-11cd-b579-08002b30bfeb")]
      public class PlainTextFilter { }

    /// <summary>
    /// Numeric constants used with the IFilter declarations: storage property ids
    /// (<c>PID_STG_*</c>) and the HRESULT codes (<c>FILTER_E_*</c> / <c>FILTER_S_*</c>)
    /// that <c>IFilter.GetChunk</c> and <c>IFilter.GetText</c> results are compared against.
    /// </summary>
    public class Constants
    {
        // ---- Storage property ids (note: 0x12 is not defined here) ----
        public const uint PID_STG_DIRECTORY      = 0x02;
        public const uint PID_STG_CLASSID        = 0x03;
        public const uint PID_STG_STORAGETYPE    = 0x04;
        public const uint PID_STG_VOLUME_ID      = 0x05;
        public const uint PID_STG_PARENT_WORKID  = 0x06;
        public const uint PID_STG_SECONDARYSTORE = 0x07;
        public const uint PID_STG_FILEINDEX      = 0x08;
        public const uint PID_STG_LASTCHANGEUSN  = 0x09;
        public const uint PID_STG_NAME           = 0x0A;
        public const uint PID_STG_PATH           = 0x0B;
        public const uint PID_STG_SIZE           = 0x0C;
        public const uint PID_STG_ATTRIBUTES     = 0x0D;
        public const uint PID_STG_WRITETIME      = 0x0E;
        public const uint PID_STG_CREATETIME     = 0x0F;
        public const uint PID_STG_ACCESSTIME     = 0x10;
        public const uint PID_STG_CHANGETIME     = 0x11;
        public const uint PID_STG_CONTENTS       = 0x13;
        public const uint PID_STG_SHORTNAME      = 0x14;

        // ---- Failure HRESULTs (high bit set, hence the unchecked cast to int) ----
        public const int FILTER_E_END_OF_CHUNKS  = unchecked((int)0x80041700);
        public const int FILTER_E_NO_MORE_TEXT   = unchecked((int)0x80041701);
        public const int FILTER_E_NO_MORE_VALUES = unchecked((int)0x80041702);
        public const int FILTER_E_NO_TEXT        = unchecked((int)0x80041705);
        public const int FILTER_E_NO_VALUES      = unchecked((int)0x80041706);

        // ---- Success HRESULT (positive, fits in int without a cast) ----
        public const int FILTER_S_LAST_TEXT      = 0x00041709;
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.InteropServices;
using System.Reflection;
using System.Text;

namespace StemText
{
    /// <summary>
    /// Creates an <see cref="IFilter"/> COM object suited to a file's extension
    /// and loads the file into it.
    /// </summary>
    public class FilterFactory
    {
        // File extension (including the leading dot) -> COM-import class to instantiate.
        // Keys are compared case-insensitively, so ".DOC" and ".doc" are equivalent.
        private static readonly Dictionary<string, Type> filterMap;

        static FilterFactory()
        {
            filterMap = new Dictionary<string, Type>(StringComparer.OrdinalIgnoreCase);

            filterMap.Add(".doc", typeof(OfficeFilter));
            filterMap.Add(".xls", typeof(OfficeFilter));
            filterMap.Add(".ppt", typeof(OfficeFilter));

            filterMap.Add(".vsd", typeof(VisioFilter));

            filterMap.Add(".pdf", typeof(PDFFilter));

            filterMap.Add(".htm", typeof(HtmlFilter));
            // Was mapped to PDFFilter, which sent HTML files to the PDF filter.
            filterMap.Add(".html", typeof(HtmlFilter));
        }

        // Not instantiable: the class only exposes static members.
        private FilterFactory()
        {
        }

        /// <summary>
        /// Instantiates the filter registered for the extension of
        /// <paramref name="filePath"/> (falling back to <see cref="PlainTextFilter"/>
        /// for unknown extensions) and loads the file through IPersistFile.
        /// </summary>
        /// <param name="filePath">Path of an existing file.</param>
        /// <returns>A filter with the file already loaded; the caller still has to call <c>Init</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null or empty.</exception>
        /// <exception cref="ArgumentException">The file does not exist.</exception>
        public static IFilter GetInstance(string filePath)
        {
            if (string.IsNullOrEmpty(filePath))
            {
                throw new ArgumentNullException("filePath");
            }

            FileInfo fi = new FileInfo(filePath);
            if (!fi.Exists)
            {
                throw new ArgumentException("File not found: " + filePath, "filePath");
            }

            // An unknown extension is an expected case, not an error: look it up
            // without throwing and fall back to the plain-text filter.
            Type filterType;
            if (!filterMap.TryGetValue(fi.Extension, out filterType))
            {
                filterType = typeof(PlainTextFilter);
            }

            object comObject = Activator.CreateInstance(filterType);
            try
            {
                IFilter filter = (IFilter)comObject;

                // ComTypes.IPersistFile replaces the obsolete UCOMIPersistFile.
                System.Runtime.InteropServices.ComTypes.IPersistFile persistFile =
                    (System.Runtime.InteropServices.ComTypes.IPersistFile)comObject;

                // Hand the filter an absolute path so the result does not depend on
                // how the COM server resolves relative paths. Mode 0 as before.
                persistFile.Load(fi.FullName, 0);

                return filter;
            }
            catch
            {
                // Do not leave the COM object alive until finalization when the
                // filter cannot be used.
                if (Marshal.IsComObject(comObject))
                {
                    Marshal.ReleaseComObject(comObject);
                }
                throw;
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Text;

namespace StemText
{
    /// <summary>
    /// Helper that drains all text chunks from an <see cref="IFilter"/>.
    /// </summary>
    public class FilterUtil
    {
        // Capacity, in characters, of the buffer handed to IFilter.GetText on each call.
        private const int TextBufferChars = 1000;

        // Not instantiable: the class only exposes static members.
        private FilterUtil()
        { }

        /// <summary>
        /// Initializes <paramref name="filter"/> and concatenates the text of every
        /// chunk flagged <see cref="CHUNKSTATE.CHUNK_TEXT"/>, in the order returned.
        /// Iteration stops at the first <c>GetChunk</c> result other than 0.
        /// </summary>
        /// <param name="filter">A filter that already has its document loaded.</param>
        /// <returns>The concatenated text; empty when the filter yields no text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="filter"/> is null.</exception>
        public static string GetText(IFilter filter)
        {
            if (filter == null)
            {
                throw new ArgumentNullException("filter");
            }

            StringBuilder sbResult = new StringBuilder();
            uint initFlags = 0;
            filter.Init(IFILTER_INIT.NONE, 0, null, ref initFlags);

            while (true)
            {
                STAT_CHUNK chunk;
                int hr = filter.GetChunk(out chunk);
                if (hr != 0)
                {
                    // End of chunks or an error: the chunk descriptor is not valid,
                    // so it must not be inspected.
                    break;
                }

                // CHUNKSTATE is a flags enum: test the bit rather than equality.
                if ((chunk.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
                {
                    AppendChunkText(filter, sbResult);
                }
            }

            return sbResult.ToString();
        }

        // Appends all text of the filter's current chunk to 'sink'.
        private static void AppendChunkText(IFilter filter, StringBuilder sink)
        {
            while (true)
            {
                uint charCount = TextBufferChars;
                StringBuilder buffer = new StringBuilder(TextBufferChars);
                int hr = filter.GetText(ref charCount, buffer);

                if (hr < 0)
                {
                    // FILTER_E_NO_TEXT, FILTER_E_NO_MORE_TEXT or any other failure:
                    // the buffer and count are not meaningful, so append nothing.
                    return;
                }

                // Never read past what the marshaled buffer actually holds.
                int length = charCount > (uint)buffer.Length ? buffer.Length : (int)charCount;
                sink.Append(buffer.ToString(0, length));

                if (hr != 0)
                {
                    // FILTER_S_LAST_TEXT (or another non-zero success code): this was
                    // the final piece of the chunk, so another call is pointless.
                    return;
                }
            }
        }
    }
}