ドキュメントからテキストのみを抽出する

先日の修正版
http://d.hatena.ne.jp/m-tanaka/20080129

Filter が NULL 文字列を返却した際に StringBuilder ではうまく扱えないため、GetText のバッファを byte[] で受け取るように少し修正

//参考
//http://sqljunkies.com/HowTo/C4AC6E97-8D84-411D-8551-08CE63EC99B6.scuk
using System;
using System.Text;
using System.Runtime.InteropServices;

namespace Yuki
{
	/// <summary>
	/// Bit flags passed as the <c>grfFlags</c> argument of <see cref="IFilter.Init"/>.
	/// Members are listed in ascending bit order.
	/// </summary>
	[Flags]
	public enum IFILTER_INIT
	{
		NONE                   = 0x000,
		CANON_PARAGRAPHS       = 0x001,
		HARD_LINE_BREAKS       = 0x002,
		CANON_HYPHENS          = 0x004,
		CANON_SPACES           = 0x008,
		APPLY_INDEX_ATTRIBUTES = 0x010,
		APPLY_OTHER_ATTRIBUTES = 0x020,
		INDEXING_ONLY          = 0x040,
		SEARCH_LINKS           = 0x080,
		APPLY_CRAWL_ATTRIBUTES = 0x100,
		FILTER_OWNED_VALUE_OK  = 0x200
	}
	/// <summary>
	/// Single-member flag set.
	/// NOTE(review): presumably the value a filter reports back through the
	/// <c>pdwFlags</c> argument of <see cref="IFilter.Init"/> - confirm against the native header.
	/// </summary>
	[Flags]
	public enum IFILTER_FLAGS
	{
		OLE_PROPERTIES = 0x1
	}
	/// <summary>
	/// Break type carried in <see cref="STAT_CHUNK.breakType"/>.
	/// Values are sequential starting at zero, so they are left implicit.
	/// </summary>
	public enum CHUNK_BREAKTYPE
	{
		CHUNK_NO_BREAK, // 0
		CHUNK_EOW,      // 1
		CHUNK_EOS,      // 2
		CHUNK_EOP,      // 3
		CHUNK_EOC       // 4
	}
	/// <summary>
	/// Bit flags carried in <see cref="STAT_CHUNK.flags"/>; each member occupies one bit.
	/// </summary>
	[Flags]
	public enum CHUNKSTATE
	{
		CHUNK_TEXT               = 1 << 0,
		CHUNK_VALUE              = 1 << 1,
		CHUNK_FILTER_OWNED_VALUE = 1 << 2
	}
	/// <summary>
	/// Two-valued kind discriminator.
	/// NOTE(review): presumably the value stored in <see cref="PROPSPEC.ulKind"/> - confirm.
	/// </summary>
	public enum PSKIND
	{
		LPWSTR, // 0
		PROPID  // 1
	}
	/// <summary>
	/// Sequentially laid-out interop struct embedded in <see cref="FULLPROPSPEC"/>:
	/// a kind field followed by a numeric id and a pointer-sized field.
	/// </summary>
	/// <remarks>
	/// NOTE(review): ulKind presumably holds a <see cref="PSKIND"/> value that selects which of
	/// propid / lpwstr is meaningful - confirm.
	/// NOTE(review): the native PROPSPEC is documented as declaring propid and lpwstr as a union;
	/// here they are separate sequential fields, so verify the layout (especially on 32-bit
	/// targets) before relying on any field that follows this struct in a parent.
	/// </remarks>
	[StructLayout(LayoutKind.Sequential)]
	public struct PROPSPEC
	{
		// Kind discriminator (see remarks).
		public uint ulKind;
		// Numeric property id.
		public uint propid;
		// Raw native pointer; not marshalled to a managed string by this declaration.
		public IntPtr lpwstr;
	}
	/// <summary>
	/// Sequentially laid-out interop struct pairing a property-set GUID with a <see cref="PROPSPEC"/>.
	/// Used as the element type of the attribute array passed to <see cref="IFilter.Init"/> and as
	/// the <c>attribute</c> field of <see cref="STAT_CHUNK"/>.
	/// </summary>
	[StructLayout(LayoutKind.Sequential)]
	public struct FULLPROPSPEC
	{
		// GUID of the property set.
		public Guid guidPropSet;
		// Property within that set.
		public PROPSPEC psProperty;
	}
	/// <summary>
	/// Sequentially laid-out interop struct received through the <c>out</c> parameter of
	/// <see cref="IFilter.GetChunk"/>. Field order and sizes define the native layout,
	/// so do not reorder or retype the members.
	/// </summary>
	[StructLayout(LayoutKind.Sequential)]
	public struct STAT_CHUNK
	{
		// Chunk identifier.
		public uint  idChunk;
		// Break type, marshalled as a 4-byte unsigned value.
		[MarshalAs(UnmanagedType.U4)]     public CHUNK_BREAKTYPE breakType;
		// Chunk state flags, marshalled as a 4-byte unsigned value; FilterUtil.GetText compares this to CHUNK_TEXT.
		[MarshalAs(UnmanagedType.U4)]     public CHUNKSTATE flags;
		// Locale identifier of the chunk (NOTE(review): presumably an LCID - confirm).
		public uint locale;
		// Property the chunk belongs to.
		[MarshalAs(UnmanagedType.Struct)] public FULLPROPSPEC attribute;
		// Source-chunk id and the start/length within it (NOTE(review): semantics inferred from the names - confirm).
		public uint idChunkSource;
		public uint cwcStartSource;
		public uint cwcLenSource;
	}
	/// <summary>
	/// Sequentially laid-out interop struct passed by value as the <c>origPos</c> argument of
	/// <see cref="IFilter.BindRegion"/>: a chunk id plus two character-count fields.
	/// </summary>
	[StructLayout(LayoutKind.Sequential)]
	public struct FILTERREGION
	{
		// Chunk the region refers to.
		public uint idChunk;
		// NOTE(review): presumably the start offset and extent, in wide characters, within that chunk - confirm.
		public uint cwcStart;
		public uint cwcExtent;
	}
	
	/// <summary>
	/// Managed declaration of the IFilter COM interface (IID 89BCB740-6119-101A-BCB7-00DD010655AF),
	/// derived directly from IUnknown.
	/// </summary>
	/// <remarks>
	/// The member order below is the COM vtable order and must not be changed.
	/// Members marked [PreserveSig] return the raw HRESULT to the caller; the other members
	/// have their HRESULT translated into an exception by the runtime.
	/// </remarks>
	[ComImport]
	[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
	[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
	public interface IFilter
	{
		// Initializes a filtering session.
		//   grfFlags    - IFILTER_INIT flags, marshalled as a 4-byte unsigned value.
		//   cAttributes - element count of aAttributes (referenced by SizeParamIndex=1).
		//   aAttributes - attribute array; FilterUtil.GetText passes null with a count of 0.
		//   pdwFlags    - passed by reference (NOTE(review): presumably receives IFILTER_FLAGS - confirm).
		void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, 
			uint cAttributes,
			[MarshalAs(UnmanagedType.LPArray, SizeParamIndex=1)] FULLPROPSPEC[] aAttributes,
			ref uint pdwFlags);
		// Earlier variant returned void (HRESULT turned into an exception):
		//void GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);
		// Fetches the description of the next chunk into pStat and returns the HRESULT,
		// which callers compare against 0 and the FILTER_E_* values in Constants.
		[PreserveSig] int GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);
		// Earlier variant received the text into a StringBuilder; per the note at the top of the
		// file that did not cope with the filter returning a NULL string, so a byte[] is used instead:
		//[PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);
		// Reads text of the current chunk into buffer and returns the HRESULT.
		// pcwcBuffer is passed by reference; FilterUtil.GetText reads it back as the number of
		// characters to take from buffer, decoding two bytes per character.
		[PreserveSig] int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPArray)]  byte[] buffer);
		// Retrieves a value for the current chunk as a raw pointer-sized handle.
		// Not called anywhere in this file.
		void GetValue(ref UIntPtr ppPropValue);
		// Binds a region of text to an interface identified by riid, returned as a raw
		// pointer-sized handle. Not called anywhere in this file.
		void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
	}
	
	/// <summary>
	/// COM coclass import for CLSID f07f3920-7b8c-11cf-9be8-00aa004b9986.
	/// FilterFactory instantiates it for ".doc", ".xls" and ".ppt" files.
	/// </summary>
	[ComImport, Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
	public class OfficeFilter { }

	
	/// <summary>
	/// COM coclass import for CLSID 4c904448-74a9-11d0-af6e-00c04fd8dc02.
	/// FilterFactory instantiates it for ".pdf" files.
	/// </summary>
	[ComImport, Guid("4c904448-74a9-11d0-af6e-00c04fd8dc02")]
	public class PDFFilter { }

	
	/// <summary>
	/// COM coclass import for CLSID faea5b46-761b-400e-b53e-e805a97a543e.
	/// FilterFactory instantiates it for ".vsd" files.
	/// </summary>
	[ComImport, Guid("faea5b46-761b-400e-b53e-e805a97a543e")]
	public class VisioFilter { }

	
	/// <summary>
	/// COM coclass import for CLSID e0ca5340-4534-11cf-b952-00aa0051fe20.
	/// FilterFactory instantiates it for ".htm" files.
	/// </summary>
	[ComImport, Guid("e0ca5340-4534-11cf-b952-00aa0051fe20")]
	public class HtmlFilter { }

	
	/// <summary>
	/// COM coclass import for CLSID c1243ca0-bf96-11cd-b579-08002b30bfeb.
	/// FilterFactory falls back to it for every extension it has no explicit mapping for.
	/// </summary>
	[ComImport, Guid("c1243ca0-bf96-11cd-b579-08002b30bfeb")]
	public class PlainTextFilter { }

	/// <summary>
	/// Numeric constants used with <see cref="IFilter"/>: PID_STG_* property ids and the
	/// FILTER_* HRESULT values that GetChunk / GetText return.
	/// </summary>
	public class Constants
	{
		// ---- PID_STG_* property ids ----
		public const uint PID_STG_DIRECTORY      = 0x02;
		public const uint PID_STG_CLASSID        = 0x03;
		public const uint PID_STG_STORAGETYPE    = 0x04;
		public const uint PID_STG_VOLUME_ID      = 0x05;
		public const uint PID_STG_PARENT_WORKID  = 0x06;
		public const uint PID_STG_SECONDARYSTORE = 0x07;
		public const uint PID_STG_FILEINDEX      = 0x08;
		public const uint PID_STG_LASTCHANGEUSN  = 0x09;
		public const uint PID_STG_NAME           = 0x0A;
		public const uint PID_STG_PATH           = 0x0B;
		public const uint PID_STG_SIZE           = 0x0C;
		public const uint PID_STG_ATTRIBUTES     = 0x0D;
		public const uint PID_STG_WRITETIME      = 0x0E;
		public const uint PID_STG_CREATETIME     = 0x0F;
		public const uint PID_STG_ACCESSTIME     = 0x10;
		public const uint PID_STG_CHANGETIME     = 0x11;
		// 0x12 is not defined here.
		public const uint PID_STG_CONTENTS       = 0x13;
		public const uint PID_STG_SHORTNAME      = 0x14;

		// ---- Failure HRESULTs (high bit set, hence the unchecked cast to int) ----
		public const int FILTER_E_END_OF_CHUNKS  = unchecked((int)0x80041700);
		public const int FILTER_E_NO_MORE_TEXT   = unchecked((int)0x80041701);
		public const int FILTER_E_NO_MORE_VALUES = unchecked((int)0x80041702);
		public const int FILTER_E_NO_TEXT        = unchecked((int)0x80041705);
		public const int FILTER_E_NO_VALUES      = unchecked((int)0x80041706);

		// ---- Success HRESULT (positive, fits in int as-is) ----
		public const int FILTER_S_LAST_TEXT      = 0x00041709;
	}
}

using System;
using System.Collections;
using System.IO;
using System.Runtime.InteropServices;
using System.Reflection;
using System.Text;

namespace Yuki
{
	/// <summary>
	/// Creates <see cref="IFilter"/> COM objects for files on disk, choosing the filter
	/// class from the file extension and loading the file into it.
	/// </summary>
	[ComVisible(false)]
	public class FilterFactory
	{
		// Maps a lower-case extension (with leading dot) to the COM filter class to instantiate.
		private static readonly Hashtable filterMap;

		static FilterFactory()
		{
			filterMap = new Hashtable();

			filterMap.Add(".doc", typeof(OfficeFilter));
			filterMap.Add(".xls", typeof(OfficeFilter));
			filterMap.Add(".ppt", typeof(OfficeFilter));

			filterMap.Add(".vsd", typeof(VisioFilter));

			filterMap.Add(".pdf", typeof(PDFFilter));

			filterMap.Add(".htm", typeof(HtmlFilter));
			// Both HTML extensions go to the HTML filter (".html" was previously routed to PDFFilter).
			filterMap.Add(".html", typeof(HtmlFilter));
		}

		// Static-only class; not meant to be instantiated.
		private FilterFactory()
		{
		}

		/// <summary>
		/// Instantiates the filter registered for the extension of <paramref name="filePath"/>
		/// (falling back to <see cref="PlainTextFilter"/> for unknown extensions) and loads the file into it.
		/// </summary>
		/// <param name="filePath">Path of an existing file.</param>
		/// <returns>A filter with the file already loaded; the caller owns the COM object.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null or empty.</exception>
		/// <exception cref="ArgumentException">The file does not exist.</exception>
		public static IFilter GetInstance(string filePath)
		{
			if (filePath == null || filePath.Length == 0)
			{
				throw new ArgumentNullException("filePath");
			}

			FileInfo fi = new FileInfo(filePath);
			if (!fi.Exists)
			{
				throw new ArgumentException("File not found: " + filePath, "filePath");
			}

			// Extensions are identifiers, not linguistic text: lower-case them culture-invariantly
			// so the lookup does not depend on the current thread culture.
			string extension = fi.Extension.ToLower(System.Globalization.CultureInfo.InvariantCulture);
			Type t = (Type)filterMap[extension];
			if (t == null)
			{
				t = typeof(PlainTextFilter);
			}

			object o = t.GetConstructor(new Type[0]).Invoke(new object[0]);
			try
			{
				IFilter filter = (IFilter)o;

				// The filter is handed its document through IPersistFile.
				UCOMIPersistFile ipf = (UCOMIPersistFile)filter;
				// IPersistFile::Load is documented to take an absolute path, so pass the resolved one.
				ipf.Load(fi.FullName, 0);

				return filter;
			}
			catch
			{
				// Do not leak the COM object when the cast or the load fails.
				Marshal.ReleaseComObject(o);
				throw;
			}
		}
	}
}
using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Runtime.InteropServices;
namespace Yuki
{

	/// <summary>
	/// Helpers for extracting plain text from a loaded <see cref="IFilter"/>.
	/// </summary>
	[ComVisible(false)]
	public class FilterUtil
	{
		// Number of UTF-16 characters requested from the filter per GetText call.
		private const int TextBufferChars = 1000;

		// Static-only class; not meant to be instantiated.
		private FilterUtil()
		{ }

		/// <summary>
		/// Initializes <paramref name="filter"/> and concatenates the text of every text chunk it yields.
		/// Chunks are appended back to back; no separator is inserted between them.
		/// </summary>
		/// <param name="filter">A filter that already has its document loaded.</param>
		/// <returns>The concatenated text, or an empty string when the filter yields no text.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="filter"/> is null.</exception>
		public static string GetText(IFilter filter)
		{
			if (filter == null)
			{
				throw new ArgumentNullException("filter");
			}

			StringBuilder sbResult = new StringBuilder();
			uint initFlags = 0;
			filter.Init(IFILTER_INIT.CANON_SPACES
						| IFILTER_INIT.CANON_HYPHENS
						| IFILTER_INIT.CANON_PARAGRAPHS
						| IFILTER_INIT.HARD_LINE_BREAKS
						| IFILTER_INIT.APPLY_INDEX_ATTRIBUTES, 0, null, ref initFlags);

			// GetText counts in wide characters but the interop buffer is a byte[]:
			// size it at two bytes per character, plus one spare character in case a
			// filter appends a terminator. (A 1000-byte buffer with a 1000-character
			// request would let native code write past the end of the array.)
			byte[] buffer = new byte[(TextBufferChars + 1) * 2];
			char[] chars = new char[TextBufferChars];

			STAT_CHUNK chunk;
			// Stop on the first non-zero HRESULT (end of chunks or any failure). The chunk
			// data is only inspected when GetChunk succeeded, never after a failed call.
			while (filter.GetChunk(out chunk) == 0)
			{
				if (chunk.flags == CHUNKSTATE.CHUNK_TEXT)
				{
					AppendChunkText(filter, buffer, chars, sbResult);
				}
			}

			return sbResult.ToString();
		}

		// Drains the text of the current chunk into output, one buffer-full at a time.
		private static void AppendChunkText(IFilter filter, byte[] buffer, char[] chars, StringBuilder output)
		{
			while (true)
			{
				uint charCount = TextBufferChars;
				int hr = filter.GetText(ref charCount, buffer);
				if (hr < 0)
				{
					// FILTER_E_NO_TEXT, FILTER_E_NO_MORE_TEXT or any other failure:
					// the buffer content is not valid, so nothing is appended.
					return;
				}

				// Never trust the returned count beyond what was asked for.
				if (charCount > TextBufferChars)
				{
					charCount = TextBufferChars;
				}

				// Raw byte-to-char copy instead of Encoding.Unicode.GetString: a surrogate pair
				// split across two reads must survive and rejoin in the StringBuilder, and
				// embedded NUL characters are kept as they are.
				Buffer.BlockCopy(buffer, 0, chars, 0, (int)charCount * 2);
				output.Append(chars, 0, (int)charCount);

				if (hr != 0)
				{
					// FILTER_S_LAST_TEXT (or another success code): the chunk has no more text.
					return;
				}
			}
		}
	}
}