ドキュメントからテキストのみを抽出する
先日の修正版のさらに修正版
http://d.hatena.ne.jp/m-tanaka/20080129
http://d.hatena.ne.jp/m-tanaka/20080206
FilterがNULL文字列を返却したさいに、StringBuilderだとうまく扱えないので、少し修正
//参考 //http://sqljunkies.com/HowTo/C4AC6E97-8D84-411D-8551-08CE63EC99B6.scuk using System; using System.Text; using System.Runtime.InteropServices; namespace Yuki { [Flags] public enum IFILTER_INIT { NONE = 0, CANON_PARAGRAPHS = 1, HARD_LINE_BREAKS = 2, CANON_HYPHENS = 4, CANON_SPACES = 8, APPLY_INDEX_ATTRIBUTES = 16, APPLY_CRAWL_ATTRIBUTES = 256, APPLY_OTHER_ATTRIBUTES = 32, INDEXING_ONLY = 64, SEARCH_LINKS = 128, FILTER_OWNED_VALUE_OK = 512 } [Flags] public enum IFILTER_FLAGS { OLE_PROPERTIES = 1 } public enum CHUNK_BREAKTYPE { CHUNK_NO_BREAK = 0, CHUNK_EOW = 1, CHUNK_EOS = 2, CHUNK_EOP = 3, CHUNK_EOC = 4 } [Flags] public enum CHUNKSTATE { CHUNK_TEXT = 0x1, CHUNK_VALUE = 0x2, CHUNK_FILTER_OWNED_VALUE = 0x4 } public enum PSKIND { LPWSTR = 0, PROPID = 1 } [StructLayout(LayoutKind.Sequential)] public struct PROPSPEC { public uint ulKind; public uint propid; public IntPtr lpwstr; } [StructLayout(LayoutKind.Sequential)] public struct FULLPROPSPEC { public Guid guidPropSet; public PROPSPEC psProperty; } [StructLayout(LayoutKind.Sequential)] public struct STAT_CHUNK { public uint idChunk; [MarshalAs(UnmanagedType.U4)] public CHUNK_BREAKTYPE breakType; [MarshalAs(UnmanagedType.U4)] public CHUNKSTATE flags; public uint locale; [MarshalAs(UnmanagedType.Struct)] public FULLPROPSPEC attribute; public uint idChunkSource; public uint cwcStartSource; public uint cwcLenSource; } [StructLayout(LayoutKind.Sequential)] public struct FILTERREGION { public uint idChunk; public uint cwcStart; public uint cwcExtent; } [ComImport] [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")] [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)] public interface IFilter { void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, uint cAttributes, [MarshalAs(UnmanagedType.LPArray, SizeParamIndex=1)] FULLPROPSPEC[] aAttributes, ref uint pdwFlags); //void GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat); [PreserveSig] int GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat); //[PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer); //[PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPArray)] byte[] buffer); [PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPArray)] char[] buffer); void GetValue(ref UIntPtr ppPropValue); void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk); } [ComImport] [Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")] public class OfficeFilter { } [ComImport] [Guid("4c904448-74a9-11d0-af6e-00c04fd8dc02")] public class PDFFilter { } [ComImport] [Guid("faea5b46-761b-400e-b53e-e805a97a543e")] public class VisioFilter { } [ComImport] [Guid("e0ca5340-4534-11cf-b952-00aa0051fe20")] public class HtmlFilter { } [ComImport] [Guid("c1243ca0-bf96-11cd-b579-08002b30bfeb")] public class PlainTextFilter { } public class Constants { public const uint PID_STG_DIRECTORY =0x00000002; public const uint PID_STG_CLASSID =0x00000003; public const uint PID_STG_STORAGETYPE =0x00000004; public const uint PID_STG_VOLUME_ID =0x00000005; public const uint PID_STG_PARENT_WORKID =0x00000006; public const uint PID_STG_SECONDARYSTORE =0x00000007; public const uint PID_STG_FILEINDEX =0x00000008; public const uint PID_STG_LASTCHANGEUSN =0x00000009; public const uint PID_STG_NAME =0x0000000a; public const uint PID_STG_PATH =0x0000000b; public const uint PID_STG_SIZE =0x0000000c; public const uint PID_STG_ATTRIBUTES =0x0000000d; public const uint PID_STG_WRITETIME =0x0000000e; public const uint PID_STG_CREATETIME =0x0000000f; public const uint PID_STG_ACCESSTIME =0x00000010; public const uint PID_STG_CHANGETIME =0x00000011; public const uint PID_STG_CONTENTS =0x00000013; public const uint PID_STG_SHORTNAME =0x00000014; public const int FILTER_E_END_OF_CHUNKS =(unchecked((int)0x80041700)); public const int FILTER_E_NO_MORE_TEXT =(unchecked((int)0x80041701)); public const int FILTER_E_NO_MORE_VALUES =(unchecked((int)0x80041702)); public const int FILTER_E_NO_TEXT =(unchecked((int)0x80041705)); public const int FILTER_E_NO_VALUES =(unchecked((int)0x80041706)); public const int FILTER_S_LAST_TEXT =(unchecked((int)0x00041709)); } }
using System; using System.Collections; using System.IO; using System.Text; using System.Runtime.InteropServices; namespace Yuki { [ComVisible(false)] public class FilterUtil { private FilterUtil() { } public static string GetText(IFilter filter) { StringBuilder sbResult = new StringBuilder(); uint i = 0; STAT_CHUNK ps = new STAT_CHUNK(); //filter.Init(0, 0, null, ref i); filter.Init(IFILTER_INIT.CANON_SPACES | IFILTER_INIT.CANON_HYPHENS | IFILTER_INIT.CANON_PARAGRAPHS | IFILTER_INIT.HARD_LINE_BREAKS | IFILTER_INIT.APPLY_INDEX_ATTRIBUTES | IFILTER_INIT.FILTER_OWNED_VALUE_OK, 0, null, ref i); int hr = 0;//HRESULT while (hr == 0) { hr = filter.GetChunk(out ps); if (ps.flags == CHUNKSTATE.CHUNK_TEXT) { uint pcwcBuffer = 1000; int hr2 = 0; while (hr2 == Constants.FILTER_S_LAST_TEXT || hr2 == 0) { pcwcBuffer = 1000; // byte[] buffer = new byte[pcwcBuffer]; // hr2 = filter.GetText(ref pcwcBuffer, buffer); // if (hr2 != Constants.FILTER_E_NO_TEXT && hr2 != Constants.FILTER_E_NO_MORE_TEXT) // { // if(buffer.Length < pcwcBuffer) // { // pcwcBuffer = (uint)buffer.Length; // } // // char[] chars = new char[1000]; // for(int j = 0; j < buffer.Length; j= j+2) // { // chars[j/2] = BitConverter.ToChar(buffer,j); // } // // string text = new string(chars,0,(int)pcwcBuffer); // sbResult.Append(text); // // } char[] buffer = new char[pcwcBuffer]; hr2 = filter.GetText(ref pcwcBuffer, buffer); if (hr2 != Constants.FILTER_E_NO_TEXT && hr2 != Constants.FILTER_E_NO_MORE_TEXT) { string text = new string(buffer,0,(int)pcwcBuffer); sbResult.Append(text); } } } } return sbResult.ToString(); } } }