Indexer des fichiers sous .NET

02 Mar 2008

Prérequis : installer les IFilters Adobe et Office.

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Runtime.InteropServices;

namespace Project.Documents
{

    #region Class Parser

    sealed class Parser
    {

        #region Enums and Structs

        /// <summary>
        /// Bit flags accepted by the <c>grfFlags</c> parameter of <c>IFilter.Init</c>.
        /// </summary>
        /// <remarks>
        /// Members are listed in ascending bit order. The names presumably mirror the
        /// native IFILTER_INIT_* constants; confirm against the platform headers before
        /// relying on a particular flag's meaning.
        /// </remarks>
        [Flags]
        public enum IFILTER_INIT
        {
            NONE                   = 0,
            CANON_PARAGRAPHS       = 1 << 0,
            HARD_LINE_BREAKS       = 1 << 1,
            CANON_HYPHENS          = 1 << 2,
            CANON_SPACES           = 1 << 3,
            APPLY_INDEX_ATTRIBUTES = 1 << 4,
            APPLY_OTHER_ATTRIBUTES = 1 << 5,
            INDEXING_ONLY          = 1 << 6,
            SEARCH_LINKS           = 1 << 7,
            APPLY_CRAWL_ATTRIBUTES = 1 << 8,
            FILTER_OWNED_VALUE_OK  = 1 << 9
        }

        /// <summary>
        /// Bit flags describing a filter. Not referenced by the parsing code in this class;
        /// presumably the value type written to the <c>pdwFlags</c> argument of
        /// <c>IFilter.Init</c> (to be confirmed against the native documentation).
        /// </summary>
        [Flags]
        public enum IFILTER_FLAGS
        {
            OLE_PROPERTIES = 0x1
        }

        /// <summary>
        /// Value type of <c>STAT_CHUNK.breakType</c>.
        /// </summary>
        /// <remarks>
        /// The suffixes presumably stand for end of word / sentence / paragraph / chapter;
        /// verify against the native CHUNK_BREAKTYPE documentation.
        /// </remarks>
        public enum CHUNK_BREAKTYPE
        {
            CHUNK_NO_BREAK = 0x0,
            CHUNK_EOW      = 0x1,
            CHUNK_EOS      = 0x2,
            CHUNK_EOP      = 0x3,
            CHUNK_EOC      = 0x4
        }

        /// <summary>
        /// Bit flags stored in <c>STAT_CHUNK.flags</c>. The text extraction code checks
        /// this field for <c>CHUNK_TEXT</c> to decide whether to read text from a chunk.
        /// </summary>
        [Flags]
        public enum CHUNKSTATE
        {
            CHUNK_TEXT               = 1 << 0,
            CHUNK_VALUE              = 1 << 1,
            CHUNK_FILTER_OWNED_VALUE = 1 << 2
        }

        /// <summary>
        /// Named values that are not referenced elsewhere in this class. Presumably the
        /// discriminator stored in <c>PROPSPEC.ulKind</c> (string name vs. numeric
        /// property id); confirm against the native PROPSPEC documentation.
        /// </summary>
        public enum PSKIND
        {
            LPWSTR = 0x0,
            PROPID = 0x1
        }

        /// <summary>
        /// Managed layout of the native PROPSPEC structure: a kind discriminator followed
        /// by a union holding either a numeric property id or a pointer to a wide string.
        /// </summary>
        /// <remarks>
        /// The native structure overlays <c>propid</c> and <c>lpwstr</c> in a union. Declaring
        /// them as two consecutive fields makes the managed struct 12 bytes instead of 8 on
        /// 32-bit processes (shifting every field that follows it inside STAT_CHUNK) and
        /// places <c>propid</c> at the wrong offset on 64-bit processes. The two members are
        /// therefore stored in a private explicit-layout struct and exposed under their
        /// original names as properties.
        /// </remarks>
        [StructLayout(LayoutKind.Sequential)]
        public struct PROPSPEC
        {
            // Overlay of the two alternatives; both start at offset 0 of the union.
            [StructLayout(LayoutKind.Explicit)]
            private struct PropSpecUnion
            {
                [FieldOffset(0)]
                public uint propid;

                [FieldOffset(0)]
                public IntPtr lpwstr;
            }

            // Discriminator telling which union member is meaningful.
            public uint ulKind;

            // Placed after ulKind; the runtime aligns it to pointer size like the native union.
            private PropSpecUnion data;

            /// <summary>Numeric property id view of the union.</summary>
            public uint propid
            {
                get { return data.propid; }
                set { data.propid = value; }
            }

            /// <summary>Wide-string pointer view of the union.</summary>
            public IntPtr lpwstr
            {
                get { return data.lpwstr; }
                set { data.lpwstr = value; }
            }
        }

        // Sequentially laid out pair of a property-set GUID and a PROPSPEC.
        // Used as the element type of the aAttributes array of IFilter.Init and as the
        // type of STAT_CHUNK.attribute. Field order is part of the interop layout and
        // must not be changed.
        [StructLayout(LayoutKind.Sequential)]
        public struct FULLPROPSPEC
        {
            // Identifier of the property set the property belongs to.
            public Guid guidPropSet;
            // The property itself, within that set.
            public PROPSPEC psProperty;
        }

        // Chunk descriptor filled in by IFilter.GetChunk (passed as an out parameter).
        // The layout is sequential and is marshaled to/from native memory, so field
        // order and field types must stay exactly as declared.
        [StructLayout(LayoutKind.Sequential)]
        public struct STAT_CHUNK
        {
            // Identifier of the chunk.
            public uint idChunk;

            // Break type of the chunk, marshaled as a 4-byte unsigned value.
            [MarshalAs(UnmanagedType.U4)]
            public CHUNK_BREAKTYPE breakType;

            // Chunk state flags, marshaled as a 4-byte unsigned value.
            // Parser.Parse inspects this field to find text chunks.
            [MarshalAs(UnmanagedType.U4)]
            public CHUNKSTATE flags;

            // Presumably the locale identifier (LCID) of the chunk text - TODO confirm.
            public uint locale;

            // Property the chunk is associated with.
            [MarshalAs(UnmanagedType.Struct)]
            public FULLPROPSPEC attribute;

            // NOTE(review): the three fields below presumably locate the source text a
            // derived chunk came from (source chunk id, start offset and length in
            // characters) - confirm against the native STAT_CHUNK documentation.
            public uint idChunkSource;

            public uint cwcStartSource;

            public uint cwcLenSource;
        }

        // Sequentially laid out region descriptor, passed by value as the origPos
        // argument of IFilter.BindRegion. Field order is part of the interop layout.
        [StructLayout(LayoutKind.Sequential)]
        public struct FILTERREGION
        {
            // Identifier of the chunk the region refers to.
            public uint idChunk;
            // Presumably the start offset of the region within the chunk, in characters - TODO confirm.
            public uint cwcStart;
            // Presumably the length of the region, in characters - TODO confirm.
            public uint cwcExtent;
        }

        #region IFilterReturnCodes Enumeration Definition

        /// <summary>
        /// HRESULT values relevant to the IFilter API, kept as unsigned 32-bit values.
        /// Only <c>S_OK</c> is consulted by the code in this class (to check the result
        /// of <c>LoadIFilter</c>).
        /// </summary>
        /// <remarks>
        /// <c>E_FAIL</c> uses the Win32 value 0x80004005. The value 0x80000008 is the
        /// legacy 16-bit definition and is never returned by Win32 COM components.
        /// </remarks>
        enum IFilterReturnCodes : uint
        {
            S_OK = 0,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_INVALIDARG = 0x80070057,
            E_OUTOFMEMORY = 0x8007000E,
            E_NOTIMPL = 0x80004001,
            E_FAIL = 0x80004005,
            FILTER_E_PASSWORD = 0x8004170B,
            FILTER_E_UNKNOWNFORMAT = 0x8004170C,
            FILTER_E_NO_TEXT = 0x80041705,
            FILTER_E_NO_VALUES = 0x80041706,
            FILTER_E_END_OF_CHUNKS = 0x80041700,
            FILTER_E_NO_MORE_TEXT = 0x80041701,
            FILTER_E_NO_MORE_VALUES = 0x80041702,
            FILTER_E_ACCESS = 0x80041703,
            FILTER_W_MONIKER_CLIPPED = 0x00041704,
            FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
            FILTER_E_LINK_UNAVAILABLE = 0x80041708,
            FILTER_S_LAST_TEXT = 0x00041709,
            FILTER_S_LAST_VALUES = 0x0004170A
        }

        #endregion

        #endregion

        #region Constants

        /// <summary>
        /// Numeric constants used with the IFilter API: storage property ids
        /// (<c>PID_STG_*</c>) and a subset of the filter HRESULTs as signed integers,
        /// which is the form returned by the <c>[PreserveSig]</c> methods of <c>IFilter</c>.
        /// </summary>
        public class Constants
        {
            // ---- Storage property ids (no constant is defined for the value 0x12) ----
            public const uint PID_STG_DIRECTORY      = 0x02;
            public const uint PID_STG_CLASSID        = 0x03;
            public const uint PID_STG_STORAGETYPE    = 0x04;
            public const uint PID_STG_VOLUME_ID      = 0x05;
            public const uint PID_STG_PARENT_WORKID  = 0x06;
            public const uint PID_STG_SECONDARYSTORE = 0x07;
            public const uint PID_STG_FILEINDEX      = 0x08;
            public const uint PID_STG_LASTCHANGEUSN  = 0x09;
            public const uint PID_STG_NAME           = 0x0A;
            public const uint PID_STG_PATH           = 0x0B;
            public const uint PID_STG_SIZE           = 0x0C;
            public const uint PID_STG_ATTRIBUTES     = 0x0D;
            public const uint PID_STG_WRITETIME      = 0x0E;
            public const uint PID_STG_CREATETIME     = 0x0F;
            public const uint PID_STG_ACCESSTIME     = 0x10;
            public const uint PID_STG_CHANGETIME     = 0x11;
            public const uint PID_STG_CONTENTS       = 0x13;
            public const uint PID_STG_SHORTNAME      = 0x14;

            // ---- Filter HRESULTs, reinterpreted as signed 32-bit integers ----
            public const int FILTER_E_END_OF_CHUNKS  = unchecked((int)0x80041700u);
            public const int FILTER_E_NO_MORE_TEXT   = unchecked((int)0x80041701u);
            public const int FILTER_E_NO_MORE_VALUES = unchecked((int)0x80041702u);
            public const int FILTER_E_NO_TEXT        = unchecked((int)0x80041705u);
            public const int FILTER_E_NO_VALUES      = unchecked((int)0x80041706u);
            public const int FILTER_S_LAST_TEXT      = 0x00041709;
        }

        #endregion

        #region Interface IFilter

        // Managed declaration of the COM IFilter interface (IID given by the Guid
        // attribute, IUnknown-based). The declaration order of the methods defines the
        // vtable order used for the calls, so members must not be reordered or removed.
        [ComImport]
        [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IFilter
        {
            // Initializes a filtering session.
            //   grfFlags    - IFILTER_INIT flags, marshaled as a 4-byte unsigned value.
            //   cAttributes - number of entries in aAttributes (also used as the
            //                 marshaled array size via SizeParamIndex = 1).
            //   aAttributes - attributes of interest; Parser.Parse passes null with a count of 0.
            //   pdwFlags    - passed by reference; presumably receives IFILTER_FLAGS - TODO confirm.
            // Declared without [PreserveSig]: a failure HRESULT is raised as an exception.
            void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags,
                      uint cAttributes,
                      [MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] FULLPROPSPEC[] aAttributes,
                      ref uint pdwFlags);

            // Advances to the next chunk and fills pStat with its description.
            // Returns the raw HRESULT; Parser.Parse keeps iterating while it is 0.
            [PreserveSig]
            int GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);

            // Reads text of the current chunk into buffer.
            //   pcwcBuffer - in: the size passed by the caller; out: the count reported by the
            //                filter (Parser.Parse uses it as the number of characters to keep).
            //   buffer     - destination, marshaled as a wide-character string.
            // Returns the raw HRESULT.
            [PreserveSig]
            int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);

            // Not called anywhere in this class. Declared without [PreserveSig], so any
            // failure HRESULT would surface as an exception rather than a return code.
            void GetValue(ref UIntPtr ppPropValue);

            // Not called anywhere in this class. Same exception behavior as GetValue.
            void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
        }

        #endregion

        #region Class CFilter

        // COM-imported placeholder class bound to a filter class id. Known class ids:
        //   Office : f07f3920-7b8c-11cf-9be8-00aa004b9986
        //   PDF    : 4C904448-74A9-11d0-AF6E-00C04FD8DC02  (the one applied below)
        [ComImport, Guid("4C904448-74A9-11d0-AF6E-00C04FD8DC02")]
        public class CFilter
        {
        }

        #endregion

        #region Interface IUnknown

        // Managed stand-in for the COM IUnknown interface (IID 00000000-0000-0000-C000-000000000046).
        // In this class it is only used as the static type of the pUnkOuter argument of
        // LoadIFilter; none of its methods are called.
        //
        // NOTE(review): with ComInterfaceType.InterfaceIsIUnknown the runtime already
        // accounts for the three IUnknown vtable slots, so the members declared below
        // presumably map to slots *after* them rather than to the real
        // QueryInterface/AddRef/Release. The IntPtr return types also look different from
        // the native HRESULT/ULONG returns. Confirm before ever invoking these methods.
        [ComImport, Guid("00000000-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        private interface IUnknown
        {
            [PreserveSig]
            IntPtr QueryInterface(ref Guid riid, out IntPtr pVoid);

            [PreserveSig]
            IntPtr AddRef();

            [PreserveSig]
            IntPtr Release();
        }

        #endregion

        #region constructor

        /// <summary>
        /// Creates a parser instance. The constructor body is empty: the class keeps no
        /// instance state and its public operations are static.
        /// </summary>
        public Parser()
        {
        }

        #endregion

        #region private static methods

        /// <summary>
        /// P/Invoke declaration of the native <c>LoadIFilter</c> function in query.dll.
        /// </summary>
        /// <remarks>
        /// The native prototype is
        /// <c>HRESULT LoadIFilter(PCWSTR pwcsPath, IUnknown* pUnkOuter, void** ppIUnk)</c>.
        /// <c>pUnkOuter</c> is an interface pointer passed by value, so it must not be
        /// declared <c>ref</c>: doing so hands the native code a non-NULL pointer to a
        /// NULL pointer, which reads as a request for aggregation with a bogus outer object.
        /// <c>ppIUnk</c> is a pure output parameter.
        /// </remarks>
        [DllImport("query.dll", CharSet = CharSet.Unicode)]
        private extern static int LoadIFilter(string pwcsPath, IUnknown pUnkOuter, out IFilter ppIUnk);

        /// <summary>
        /// Loads the IFilter registered for the given file.
        /// </summary>
        /// <param name="filename">Path of the file to find a filter for.</param>
        /// <returns>
        /// The filter instance, or <c>null</c> when <paramref name="filename"/> is null or
        /// empty or when <c>LoadIFilter</c> returns anything other than S_OK. The caller
        /// owns the returned COM object and is responsible for releasing it.
        /// </returns>
        private static IFilter loadIFilter(string filename)
        {
            // Nothing to look up; avoid handing a null/empty path to native code.
            if (String.IsNullOrEmpty(filename))
            {
                return null;
            }

            // Try to load the corresponding IFilter (no aggregation: outer unknown is NULL).
            IFilter filter;
            int resultLoad = LoadIFilter(filename, null, out filter);
            if (resultLoad != (int)IFilterReturnCodes.S_OK)
            {
                return null;
            }
            return filter;
        }

        #endregion

        #region public static methods

        /// <summary>
        /// Tells whether an IFilter can be loaded for the given file.
        /// </summary>
        /// <param name="filename">Path of the file to test.</param>
        /// <returns><c>true</c> when a filter was loaded successfully; otherwise <c>false</c>.</returns>
        public static bool IsParseable(string filename)
        {
            IFilter filter = loadIFilter(filename);
            if (filter == null)
            {
                return false;
            }

            // The filter was only loaded as a probe. Release it right away instead of
            // leaving the COM object (and whatever it holds on the file) to the finalizer.
            Marshal.ReleaseComObject(filter);
            return true;
        }

        /// <summary>
        /// Extracts the plain text of a file by driving the IFilter loaded for it.
        /// </summary>
        /// <param name="filename">Path of the file to read.</param>
        /// <returns>
        /// The text of all text chunks, concatenated in the order the filter returned
        /// them, or an empty string when no filter could be loaded for the file.
        /// </returns>
        /// <remarks>
        /// <c>IFilter.Init</c> is declared without <c>[PreserveSig]</c>, so a failure
        /// there propagates as an exception. Chunk enumeration stops at the first
        /// non-zero result of <c>GetChunk</c>. The filter is released in all cases.
        /// </remarks>
        public static string Parse(string filename)
        {
            // Capacity, in characters, of the buffer handed to each GetText call.
            const int TextBufferSize = 60000;

            IFilter filter = null;

            try
            {
                filter = loadIFilter(filename);
                if (filter == null)
                {
                    // No filter available: report "no text" instead of failing with a
                    // NullReferenceException on the Init call below.
                    return String.Empty;
                }

                StringBuilder plainTextResult = new StringBuilder();

                // Value written back by Init; not used afterwards.
                uint filterFlags = 0;
                filter.Init(IFILTER_INIT.NONE, 0, null, ref filterFlags);

                STAT_CHUNK ps;
                int resultChunk = filter.GetChunk(out ps);
                while (resultChunk == 0)
                {
                    // CHUNKSTATE is a [Flags] enum: test the bit rather than comparing for
                    // equality, so a chunk carrying additional flags is still read.
                    if ((ps.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
                    {
                        int resultText = 0;
                        while (resultText == 0 || resultText == Constants.FILTER_S_LAST_TEXT)
                        {
                            uint sizeBuffer = TextBufferSize;
                            StringBuilder sbBuffer = new StringBuilder(TextBufferSize);
                            resultText = filter.GetText(ref sizeBuffer, sbBuffer);

                            // Only trust the buffer when the call reported success.
                            bool gotText = resultText == 0 || resultText == Constants.FILTER_S_LAST_TEXT;

                            // The marshaled StringBuilder stops at the first null character, so it
                            // can be shorter than the count reported by the filter; clamp to avoid
                            // an ArgumentOutOfRangeException from ToString.
                            int length = (int)Math.Min(sizeBuffer, (uint)sbBuffer.Length);

                            if (gotText && length > 0)
                            {
                                plainTextResult.Append(sbBuffer.ToString(0, length));
                            }
                        }
                    }
                    resultChunk = filter.GetChunk(out ps);
                }
                return plainTextResult.ToString();
            }
            finally
            {
                if (filter != null)
                {
                    Marshal.ReleaseComObject(filter);
                }
            }
        }

        #endregion

    }

    #endregion

    public class Document
    {

        [...]

        /// <summary>
        /// Returns the text extracted from a file, or an empty string when
        /// <c>Parser.IsParseable</c> reports that the file cannot be parsed.
        /// </summary>
        /// <param name="_sFile">Path of the file to parse.</param>
        /// <returns>The text produced by <c>Parser.Parse</c>, or <c>String.Empty</c>.</returns>
        public static String Parse(String _sFile)
        {
            if (!Parser.IsParseable(_sFile))
            {
                return String.Empty;
            }

            return Parser.Parse(_sFile);
        }

       [...]

    }
}