ドキュメントからテキストのみを抽出する
先日の修正版
http://d.hatena.ne.jp/m-tanaka/20080129
FilterがNULL文字列を返却した際に、StringBuilderではうまく扱えないので、少し修正しました。
// Reference:
// http://sqljunkies.com/HowTo/C4AC6E97-8D84-411D-8551-08CE63EC99B6.scuk
using System;
using System.Text;
using System.Runtime.InteropServices;

namespace Yuki
{
    /// <summary>
    /// Flag values for the <c>grfFlags</c> argument of <see cref="IFilter.Init"/>.
    /// Names and values mirror the native IFILTER_INIT enumeration.
    /// </summary>
    [Flags]
    public enum IFILTER_INIT
    {
        NONE = 0,
        CANON_PARAGRAPHS = 1,
        HARD_LINE_BREAKS = 2,
        CANON_HYPHENS = 4,
        CANON_SPACES = 8,
        APPLY_INDEX_ATTRIBUTES = 16,
        APPLY_OTHER_ATTRIBUTES = 32,
        INDEXING_ONLY = 64,
        SEARCH_LINKS = 128,
        APPLY_CRAWL_ATTRIBUTES = 256,
        FILTER_OWNED_VALUE_OK = 512
    }

    /// <summary>
    /// Mirrors the native IFILTER_FLAGS enumeration.
    /// NOTE(review): presumably the value written to <c>pdwFlags</c> by
    /// <see cref="IFilter.Init"/>; that parameter is declared as a plain uint here.
    /// </summary>
    [Flags]
    public enum IFILTER_FLAGS
    {
        OLE_PROPERTIES = 1
    }

    /// <summary>Type of break that precedes a chunk (<see cref="STAT_CHUNK.breakType"/>).</summary>
    public enum CHUNK_BREAKTYPE
    {
        CHUNK_NO_BREAK = 0,
        CHUNK_EOW = 1,
        CHUNK_EOS = 2,
        CHUNK_EOP = 3,
        CHUNK_EOC = 4
    }

    /// <summary>Kind of content a chunk carries (<see cref="STAT_CHUNK.flags"/>).</summary>
    [Flags]
    public enum CHUNKSTATE
    {
        CHUNK_TEXT = 0x1,
        CHUNK_VALUE = 0x2,
        CHUNK_FILTER_OWNED_VALUE = 0x4
    }

    /// <summary>Discriminator values for <see cref="PROPSPEC.ulKind"/>.</summary>
    public enum PSKIND
    {
        LPWSTR = 0,
        PROPID = 1
    }

    /// <summary>
    /// Managed layout of the native PROPSPEC structure.
    /// </summary>
    /// <remarks>
    /// In the native structure <c>propid</c> and <c>lpwstr</c> share storage (a union that
    /// follows <c>ulKind</c>). Declaring them as two sequential fields makes the struct too
    /// large on 32-bit processes and makes <c>propid</c> read the wrong bytes, so only
    /// <see cref="lpwstr"/> is a real field and <see cref="propid"/> is a view over it.
    /// </remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct PROPSPEC
    {
        /// <summary>Selects which union member is valid; see <see cref="PSKIND"/>.</summary>
        public uint ulKind;

        /// <summary>Union storage: pointer to the property name when ulKind is LPWSTR.</summary>
        public IntPtr lpwstr;

        /// <summary>
        /// Union view: numeric property id when ulKind is PROPID. Reads and writes the low
        /// 32 bits of <see cref="lpwstr"/> (little-endian layout, as on all Windows targets).
        /// </summary>
        public uint propid
        {
            get { return unchecked((uint)lpwstr.ToInt64()); }
            set
            {
                // new IntPtr(long) overflows in a 32-bit process for values above
                // int.MaxValue, so reinterpret the bits there instead of widening.
                lpwstr = (IntPtr.Size == 8)
                    ? new IntPtr((long)value)
                    : new IntPtr(unchecked((int)value));
            }
        }
    }

    /// <summary>Property set GUID plus property specifier (native FULLPROPSPEC).</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct FULLPROPSPEC
    {
        public Guid guidPropSet;
        public PROPSPEC psProperty;
    }

    /// <summary>Chunk descriptor filled in by <see cref="IFilter.GetChunk"/> (native STAT_CHUNK).</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct STAT_CHUNK
    {
        public uint idChunk;
        [MarshalAs(UnmanagedType.U4)]
        public CHUNK_BREAKTYPE breakType;
        [MarshalAs(UnmanagedType.U4)]
        public CHUNKSTATE flags;
        public uint locale;
        [MarshalAs(UnmanagedType.Struct)]
        public FULLPROPSPEC attribute;
        public uint idChunkSource;
        public uint cwcStartSource;
        public uint cwcLenSource;
    }

    /// <summary>Position of a text region inside a chunk (native FILTERREGION).</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct FILTERREGION
    {
        public uint idChunk;
        public uint cwcStart;
        public uint cwcExtent;
    }

    /// <summary>
    /// COM import of the IFilter interface. Method order must match the native vtable,
    /// so do not reorder the members.
    /// </summary>
    [ComImport]
    [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
    [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IFilter
    {
        /// <summary>Initializes a filtering session.</summary>
        /// <param name="grfFlags">Combination of <see cref="IFILTER_INIT"/> flags.</param>
        /// <param name="cAttributes">Number of entries in <paramref name="aAttributes"/>.</param>
        /// <param name="aAttributes">Attributes to request; may be null when the count is 0.</param>
        /// <param name="pdwFlags">Receives flags reported by the filter.</param>
        void Init(
            [MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags,
            uint cAttributes,
            [MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] FULLPROPSPEC[] aAttributes,
            ref uint pdwFlags);

        /// <summary>
        /// Advances to the next chunk. Declared with PreserveSig so that the HRESULT
        /// (for example <see cref="Constants.FILTER_E_END_OF_CHUNKS"/>) is returned to the
        /// caller instead of being converted into an exception.
        /// </summary>
        /// <param name="pStat">Receives the description of the chunk.</param>
        /// <returns>The raw HRESULT; 0 on success.</returns>
        [PreserveSig]
        int GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);

        /// <summary>
        /// Retrieves text from the current chunk.
        /// </summary>
        /// <remarks>
        /// The buffer is a raw byte array rather than a StringBuilder because the
        /// StringBuilder marshalling could not cope with the strings some filters return.
        /// The bytes are UTF-16 code units, so the array needs two bytes per character of
        /// the capacity passed in <paramref name="pcwcBuffer"/>.
        /// </remarks>
        /// <param name="pcwcBuffer">
        /// In: capacity of <paramref name="buffer"/> in characters (not bytes).
        /// Out: number of characters written.
        /// </param>
        /// <param name="buffer">Destination for the UTF-16 text.</param>
        /// <returns>The raw HRESULT; see the FILTER_* values in <see cref="Constants"/>.</returns>
        [PreserveSig]
        int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPArray)] byte[] buffer);

        /// <summary>Retrieves a value (native PROPVARIANT pointer) from the current chunk.</summary>
        void GetValue(ref UIntPtr ppPropValue);

        /// <summary>Retrieves an interface representing the given region of text.</summary>
        void BindRegion(
            [MarshalAs(UnmanagedType.Struct)] FILTERREGION origPos,
            ref Guid riid,
            ref UIntPtr ppunk);
    }

    // COM coclass imports. Each CLSID must be registered on the machine by the
    // corresponding filter installation; creating an instance fails otherwise.

    /// <summary>Coclass used for Office documents (.doc/.xls/.ppt).</summary>
    [ComImport]
    [Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
    public class OfficeFilter
    {
    }

    /// <summary>Coclass used for PDF documents.</summary>
    [ComImport]
    [Guid("4c904448-74a9-11d0-af6e-00c04fd8dc02")]
    public class PDFFilter
    {
    }

    /// <summary>Coclass used for Visio documents.</summary>
    [ComImport]
    [Guid("faea5b46-761b-400e-b53e-e805a97a543e")]
    public class VisioFilter
    {
    }

    /// <summary>Coclass used for HTML documents.</summary>
    [ComImport]
    [Guid("e0ca5340-4534-11cf-b952-00aa0051fe20")]
    public class HtmlFilter
    {
    }

    /// <summary>Coclass used for plain text and as the fallback filter.</summary>
    [ComImport]
    [Guid("c1243ca0-bf96-11cd-b579-08002b30bfeb")]
    public class PlainTextFilter
    {
    }

    /// <summary>
    /// Storage property ids (PID_STG_*) and IFilter status codes (FILTER_*).
    /// </summary>
    public class Constants
    {
        public const uint PID_STG_DIRECTORY = 0x00000002;
        public const uint PID_STG_CLASSID = 0x00000003;
        public const uint PID_STG_STORAGETYPE = 0x00000004;
        public const uint PID_STG_VOLUME_ID = 0x00000005;
        public const uint PID_STG_PARENT_WORKID = 0x00000006;
        public const uint PID_STG_SECONDARYSTORE = 0x00000007;
        public const uint PID_STG_FILEINDEX = 0x00000008;
        public const uint PID_STG_LASTCHANGEUSN = 0x00000009;
        public const uint PID_STG_NAME = 0x0000000a;
        public const uint PID_STG_PATH = 0x0000000b;
        public const uint PID_STG_SIZE = 0x0000000c;
        public const uint PID_STG_ATTRIBUTES = 0x0000000d;
        public const uint PID_STG_WRITETIME = 0x0000000e;
        public const uint PID_STG_CREATETIME = 0x0000000f;
        public const uint PID_STG_ACCESSTIME = 0x00000010;
        public const uint PID_STG_CHANGETIME = 0x00000011;
        public const uint PID_STG_CONTENTS = 0x00000013;
        public const uint PID_STG_SHORTNAME = 0x00000014;

        // HRESULTs with the high bit set are failures; FILTER_S_LAST_TEXT is a success code.
        public const int FILTER_E_END_OF_CHUNKS = (unchecked((int)0x80041700));
        public const int FILTER_E_NO_MORE_TEXT = (unchecked((int)0x80041701));
        public const int FILTER_E_NO_MORE_VALUES = (unchecked((int)0x80041702));
        public const int FILTER_E_NO_TEXT = (unchecked((int)0x80041705));
        public const int FILTER_E_NO_VALUES = (unchecked((int)0x80041706));
        public const int FILTER_S_LAST_TEXT = (unchecked((int)0x00041709));
    }
}
using System;
using System.Collections;
using System.Globalization;
using System.IO;
using System.Runtime.InteropServices;
using System.Reflection;
using System.Text;

namespace Yuki
{
    /// <summary>
    /// Creates an <see cref="IFilter"/> for a file, choosing the COM filter class from the
    /// file extension and loading the file into it.
    /// </summary>
    [ComVisible(false)]
    public class FilterFactory
    {
        // Maps a lower-case extension (including the leading dot) to the coclass Type.
        private static readonly Hashtable filterMap;

        static FilterFactory()
        {
            filterMap = new Hashtable();
            filterMap.Add(".doc", typeof(OfficeFilter));
            filterMap.Add(".xls", typeof(OfficeFilter));
            filterMap.Add(".ppt", typeof(OfficeFilter));
            filterMap.Add(".vsd", typeof(VisioFilter));
            filterMap.Add(".pdf", typeof(PDFFilter));
            filterMap.Add(".htm", typeof(HtmlFilter));
            // Was mapped to PDFFilter, which sent .html files to the wrong filter.
            filterMap.Add(".html", typeof(HtmlFilter));
        }

        private FilterFactory()
        {
        }

        /// <summary>
        /// Instantiates the filter registered for the extension of <paramref name="filePath"/>
        /// (falling back to <see cref="PlainTextFilter"/>) and loads the file into it.
        /// </summary>
        /// <param name="filePath">Path of an existing file.</param>
        /// <returns>A filter with the file already loaded; the caller owns the COM object.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="filePath"/> is null or empty.
        /// </exception>
        /// <exception cref="ArgumentException">The file does not exist.</exception>
        public static IFilter GetInstance(string filePath)
        {
            if (filePath == null || filePath.Length == 0)
            {
                throw new ArgumentNullException("filePath");
            }

            FileInfo fi = new FileInfo(filePath);
            if (!fi.Exists)
            {
                throw new ArgumentException("File does not exist: " + filePath, "filePath");
            }

            // Invariant culture: extension matching must not depend on the UI language.
            string extension = fi.Extension.ToLower(CultureInfo.InvariantCulture);
            Type filterType = filterMap[extension] as Type;
            if (filterType == null)
            {
                filterType = typeof(PlainTextFilter);
            }

            object instance = filterType.GetConstructor(Type.EmptyTypes).Invoke(new object[0]);
            try
            {
                IFilter filter = (IFilter)instance;
                // NOTE(review): UCOMIPersistFile is the .NET 1.x name; on .NET 2.0 and later
                // System.Runtime.InteropServices.ComTypes.IPersistFile replaces it.
                UCOMIPersistFile persistFile = (UCOMIPersistFile)filter;
                // Pass the absolute path so the result does not depend on how the COM
                // server resolves relative paths.
                persistFile.Load(fi.FullName, 0);
                return filter;
            }
            catch
            {
                // Do not leak the COM object when the cast or the load fails.
                if (Marshal.IsComObject(instance))
                {
                    Marshal.ReleaseComObject(instance);
                }
                throw;
            }
        }
    }
}
using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Runtime.InteropServices;

namespace Yuki
{
    /// <summary>
    /// Helper that drains all text chunks from an <see cref="IFilter"/>.
    /// </summary>
    [ComVisible(false)]
    public class FilterUtil
    {
        // Capacity, in UTF-16 characters, offered to the filter on every GetText call.
        private const int TextBufferChars = 1000;

        private FilterUtil()
        {
        }

        /// <summary>
        /// Initializes the filter and concatenates the text of every text chunk it reports.
        /// </summary>
        /// <param name="filter">A filter that already has its document loaded.</param>
        /// <returns>
        /// The extracted text; chunks are appended back to back without separators.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="filter"/> is null.</exception>
        public static string GetText(IFilter filter)
        {
            if (filter == null)
            {
                throw new ArgumentNullException("filter");
            }

            StringBuilder result = new StringBuilder();
            uint filterFlags = 0;
            filter.Init(
                IFILTER_INIT.CANON_SPACES
                | IFILTER_INIT.CANON_HYPHENS
                | IFILTER_INIT.CANON_PARAGRAPHS
                | IFILTER_INIT.HARD_LINE_BREAKS
                | IFILTER_INIT.APPLY_INDEX_ATTRIBUTES,
                0,
                null,
                ref filterFlags);

            // GetText counts characters, not bytes, so the byte buffer needs two bytes per
            // character. The previous 1000-byte buffer was advertised as 1000 characters,
            // letting the filter write past the end of the array. One extra character of
            // slack is kept in case a filter appends a terminator.
            byte[] raw = new byte[(TextBufferChars + 1) * 2];
            char[] chars = new char[TextBufferChars];

            while (true)
            {
                STAT_CHUNK chunk;
                int hrChunk = filter.GetChunk(out chunk);
                if (hrChunk != 0)
                {
                    // End of chunks or an error: the chunk data is not valid, stop here
                    // instead of reading text for a chunk that was never returned.
                    break;
                }

                if ((chunk.flags & CHUNKSTATE.CHUNK_TEXT) == 0)
                {
                    continue;
                }

                AppendChunkText(filter, result, raw, chars);
            }

            return result.ToString();
        }

        // Appends all text of the current chunk to result, using raw/chars as scratch space.
        private static void AppendChunkText(IFilter filter, StringBuilder result, byte[] raw, char[] chars)
        {
            int hr = 0;
            while (hr == 0 || hr == Constants.FILTER_S_LAST_TEXT)
            {
                uint charCount = TextBufferChars;
                hr = filter.GetText(ref charCount, raw);
                if (hr < 0)
                {
                    // FILTER_E_NO_TEXT, FILTER_E_NO_MORE_TEXT or any other failure:
                    // the buffer holds nothing usable.
                    break;
                }

                if (charCount > TextBufferChars)
                {
                    charCount = TextBufferChars;
                }

                // Copy raw UTF-16 code units instead of decoding, so a surrogate pair
                // split across two calls is rejoined intact by the StringBuilder.
                Buffer.BlockCopy(raw, 0, chars, 0, (int)charCount * 2);
                result.Append(chars, 0, (int)charCount);
            }
        }
    }
}