| 网站首页 | VB.Net实例分析 | .Net技术文摘 | 下载中心 | VB.Net书籍笔记 | .Net源码 | VBS脚本与组件调用 | Windows2008技术文摘 | 给我们留言 | 
最新公告:

  没有公告

您现在的位置: 乐博网 >> VB.Net开发技巧 >> 技术文摘正文
最新推荐 更多内容
IFilter读取文本的方法
IFilter读取文本的方法
作者:Ilu    来源:乐博网整理     更新时间:2009-11-18

IFilter读取文本的方法

代码如下(原文为C#代码,乐博网将其转化为VB.Net):


''' Sample library for using IFilter to read text from any registered filter type.
'''
''' Related links:
''' http://msdn.microsoft.com/en-us/library/ms691105(VS.85).aspx
''' http://ifilter.codeplex.com/
''' http://www.pinvoke.net/default.aspx/query/LoadIFilter.html
'''
''' Code here is taken from a combination of the project located at http://ifilter.codeplex.com/
''' as well as definitions taken from p-invoke.net. License is MS-PL so enjoy.
'''

Imports System
Imports System.Collections.Generic
Imports System.Linq
Imports System.Text
Imports System.Runtime.InteropServices

Namespace FilterLibrary
    Public Class FilterCode
        ' Capacity, in characters, of the temporary StringBuilder that GetTextFromDocument
        ' hands to IFilter.GetText on every read of a text chunk.
        Private DefaultBufferSize As Integer = 4096
       
        ''' <summary>
        ''' Utilizes IFilter interface in Windows to parse the contents of files.
        ''' </summary>
        ''' <param name="Path">Path - Location of file to parse</param>
        ''' <param name="Path">Buffer - Return text artifacts</param>
        ''' <returns>Raw set of strings from the document in plain text format.</returns>
        Public Sub GetTextFromDocument(ByVal Path As String, ByRef Buffer As StringBuilder)
            Dim filter As IFilter = Nothing
            Dim hresult As Integer
            Dim rtn As IFilterReturnCodes
           
            ' Initialize the return buffer to 64K.
            Buffer = New StringBuilder(64 * 1024)
           
            ' Try to load the filter for the path given.
            hresult = LoadIFilter(Path, New IntPtr(0), filter)
            If hresult = 0 Then
                Dim uflags As IFILTER_FLAGS
               
                ' Init the filter provider.
                rtn = filter.Init(IFILTER_INIT.IFILTER_INIT_CANON_PARAGRAPHS Or IFILTER_INIT.IFILTER_INIT_CANON_HYPHENS Or IFILTER_INIT.IFILTER_INIT_CANON_SPACES Or IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES Or IFILTER_INIT.IFILTER_INIT_INDEXING_ONLY, 0, New IntPtr(0), uflags)
                If rtn = IFilterReturnCodes.S_OK Then
                    Dim statChunk As STAT_CHUNK
                   
                    ' Outer loop will read chunks from the document at a time. For those
                    ' chunks that have text, the contents will be pulled and put into the
                    ' return buffer.
                    Dim bMoreChunks As Boolean = True
                    While bMoreChunks
                        rtn = filter.GetChunk(statChunk)
                        If rtn = IFilterReturnCodes.S_OK Then
                            ' Ignore all non-text chunks.
                            If statChunk.flags <> CHUNKSTATE.CHUNK_TEXT Then
                                Continue While
                            End If
                           
                            ' Check for white space items and add the appropriate breaks.
                            Select Case statChunk.breakType
                                Case CHUNK_BREAKTYPE.CHUNK_NO_BREAK
                                    Exit Select
                               
                                Case CHUNK_BREAKTYPE.CHUNK_EOW
                                    Buffer.Append(" "c)
                                    Exit Select
                               
                                Case CHUNK_BREAKTYPE.CHUNK_EOC, CHUNK_BREAKTYPE.CHUNK_EOP, CHUNK_BREAKTYPE.CHUNK_EOS
                                    Buffer.AppendLine()
                                    Exit Select
                            End Select
                           
                            ' At this point we have a text chunk. The following code will pull out
                            ' all of it and add it to the buffer.
                            Dim bMoreText As Boolean = True
                            While bMoreText
                                ' Create a temporary string buffer we can use for the parsing algorithm.
                                Dim cBuffer As Integer = DefaultBufferSize
                                Dim sbBuffer As New StringBuilder(DefaultBufferSize)
                               
                                ' Read the next piece of data up to the size of our local buffer.
                                rtn = filter.GetText(cBuffer, sbBuffer)
                                If rtn = IFilterReturnCodes.S_OK OrElse rtn = IFilterReturnCodes.FILTER_S_LAST_TEXT Then
                                    ' If any data was returned, scrub it and then add it to the buffer.
                                    CleanUpCharacters(cBuffer, sbBuffer)
                                    Buffer.Append(sbBuffer.ToString())
                                   
                                    ' If we got back some text but there is no more, terminate the loop.
                                    If rtn = IFilterReturnCodes.FILTER_S_LAST_TEXT Then
                                        bMoreText = False
                                        Exit While
                                    End If
                                ' Once all data is exhausted, we are done so terminate.
                                ElseIf rtn = IFilterReturnCodes.FILTER_E_NO_MORE_TEXT Then
                                    bMoreText = False
                                    Exit While
                                ' Check for any fatal errors. It is a bug if you land here.
                                ElseIf rtn = IFilterReturnCodes.FILTER_E_NO_TEXT Then
                                    System.Diagnostics.Debug.Assert(False, "Should not get here")
                                    Throw New InvalidOperationException()
                                End If
                            End While
                        ' Once all chunks have been read, we are done with the file.
                        ElseIf rtn = IFilterReturnCodes.FILTER_E_END_OF_CHUNKS Then
                            bMoreChunks = False
                            Exit While
                        ElseIf rtn = IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE OrElse rtn = IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE Then
                            Continue While
                        Else
                            Throw New COMException("IFilter COM error: " & rtn.ToString())
                        End If
                    End While
                End If
            Else
                ' If you get here there is no filter for the file type you asked for. Throw an
                ' exception for the caller.
                Throw New InvalidOperationException("Failed to find IFilter for file " & Path)
            End If
        End Sub
       
        ''' <summary>
        ''' P/Invoke declaration for the LoadIFilter export of query.dll, used to obtain
        ''' the IFilter for a file.
        ''' </summary>
        ''' <param name="pwcsPath">Path of the file to load a filter for (marshalled as Unicode).</param>
        ''' <param name="pUnkOuter">
        ''' Marshalled as an IUnknown pointer.
        ''' NOTE(review): presumably the controlling unknown for COM aggregation, to be
        ''' passed as Nothing when not aggregating - confirm against the LoadIFilter docs.
        ''' </param>
        ''' <param name="ppIUnk">Receives the loaded filter.</param>
        ''' <returns>
        ''' An integer status code; GetTextFromDocument treats 0 as success and any other
        ''' value as "no filter available".
        ''' </returns>
        <DllImport("query.dll", SetLastError := True, CharSet := CharSet.Unicode)> _
        Private Shared Function LoadIFilter(ByVal pwcsPath As String, <MarshalAs(UnmanagedType.IUnknown)> ByVal pUnkOuter As Object, ByRef ppIUnk As IFilter) As Integer
        End Function
       
        ''' <summary>
        ''' Managed declaration of the COM IFilter interface (imported by GUID, IUnknown-based).
        ''' Every method is PreserveSig, so results come back as IFilterReturnCodes
        ''' instead of being turned into exceptions.
        ''' </summary>
        ''' <remarks>
        ''' NOTE(review): the member order presumably has to match the native interface's
        ''' method order - do not reorder without confirming.
        ''' </remarks>
        <ComImport(), Guid("89BCB740-6119-101A-BCB7-00DD010655AF")> _
        <InterfaceType(ComInterfaceType.InterfaceIsIUnknown)> _
        Public Interface IFilter
            ''' <summary>
            ''' The IFilter::Init method initializes a filtering session.
            ''' </summary>
            ''' <param name="grfFlags">
            ''' [in] Flag settings from the IFILTER_INIT enumeration for controlling text
            ''' standardization, property output, embedding scope, and IFilter access patterns.
            ''' </param>
            ''' <param name="cAttributes">
            ''' [in] The size of the attributes array. When nonzero, cAttributes takes
            ''' precedence over attributes specified in grfFlags. If no attribute flags are
            ''' specified and cAttributes is zero, the default is given by the PSGUID_STORAGE
            ''' storage property set (date and time of the last write to the file, size, and
            ''' so on) and by the PID_STG_CONTENTS 'contents' property, which maps to the
            ''' main contents of the file.
            ''' </param>
            ''' <param name="aAttributes">
            ''' [in] Array of pointers to FULLPROPSPEC structures for the requested
            ''' properties. When cAttributes is nonzero, only the properties in aAttributes
            ''' are returned.
            ''' </param>
            ''' <param name="pdwFlags">
            ''' [out] Information about additional properties available to the caller;
            ''' from the IFILTER_FLAGS enumeration.
            ''' </param>
            <PreserveSig()> _
            Function Init(ByVal grfFlags As IFILTER_INIT, ByVal cAttributes As Integer, ByVal aAttributes As IntPtr, ByRef pdwFlags As IFILTER_FLAGS) As IFilterReturnCodes

            ''' <summary>
            ''' The IFilter::GetChunk method positions the filter at the beginning
            ''' of the next chunk, or at the first chunk if this is the first call to the
            ''' GetChunk method, and returns a description of the current chunk.
            ''' </summary>
            ''' <param name="pStat">Receives the description of the current chunk.</param>
            <PreserveSig()> _
            Function GetChunk(ByRef pStat As STAT_CHUNK) As IFilterReturnCodes

            ''' <summary>
            ''' The IFilter::GetText method retrieves text (text-type properties)
            ''' from the current chunk, which must have a CHUNKSTATE enumeration value
            ''' of CHUNK_TEXT.
            ''' </summary>
            ''' <param name="pcwcBuffer">
            ''' [in/out] On entry, the size of the awcBuffer array in wide/Unicode
            ''' characters. On exit, the number of Unicode characters written to awcBuffer.
            ''' Note that this value is not the number of bytes in the buffer.
            ''' </param>
            ''' <param name="awcBuffer">
            ''' Text retrieved from the current chunk. The original note reads "Do not
            ''' terminate the buffer with a character".
            ''' NOTE(review): presumably this means the text is not null-terminated, so
            ''' only the first pcwcBuffer characters are meaningful - confirm.
            ''' </param>
            <PreserveSig()> _
            Function GetText(ByRef pcwcBuffer As Integer, <Out(), MarshalAs(UnmanagedType.LPWStr)> ByVal awcBuffer As StringBuilder) As IFilterReturnCodes

            ''' <summary>
            ''' The IFilter::GetValue method retrieves a value (public value-type
            ''' property) from a chunk, which must have a CHUNKSTATE enumeration value
            ''' of CHUNK_VALUE.
            ''' </summary>
            ''' <param name="PropVal">
            ''' Pointer to a PROPVARIANT structure, exposed here as a raw IntPtr rather
            ''' than a marshalled structure.
            ''' </param>
            ''' <remarks>
            ''' Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some PROPVARIANT
            ''' structures contain pointers, which can be freed by calling the
            ''' PropVariantClear function. It is up to the caller of the GetValue method
            ''' to call the PropVariantClear method.
            ''' </remarks>
            <PreserveSig()> _
            Function GetValue(ByRef PropVal As IntPtr) As IFilterReturnCodes

            ''' <summary>
            ''' The IFilter::BindRegion method retrieves an interface representing
            ''' the specified portion of the object.
            ''' Currently reserved for future use.
            ''' </summary>
            ''' <param name="origPos">Region of the object to bind to.</param>
            ''' <param name="riid">Identifier of the interface being requested.</param>
            ''' <param name="ppunk">Receives the requested interface.</param>
            <PreserveSig()> _
            Function BindRegion(ByRef origPos As FILTERREGION, ByRef riid As Guid, ByRef ppunk As Object) As IFilterReturnCodes
        End Interface
       
        ''' <summary>
        ''' Describes a portion of a filtered object; passed by reference to
        ''' IFilter.BindRegion as origPos.
        ''' </summary>
        ''' <remarks>
        ''' NOTE(review): field order presumably mirrors the native structure layout -
        ''' do not reorder without confirming.
        ''' </remarks>
        Public Structure FILTERREGION
            ' presumably the identifier of the chunk the region belongs to - TODO confirm
            Public idChunk As Integer
            ' presumably the start offset of the region, in wide characters - TODO confirm
            Public cwcStart As Integer
            ' presumably the length of the region, in wide characters - TODO confirm
            Public cwcExtent As Integer
        End Structure
       
        Public Enum IFilterReturnCodes As UInteger
            ''' <summary>
            ''' Success
            ''' </summary>
            S_OK = 0
            ''' <summary>
            ''' The function was denied access to the filter file.
            ''' </summary>
            E_ACCESSDENIED = &H80070005
            ''' <summary>
            ''' The function encountered an invalid handle,
            ''' probably due to a low-memory situation.
            ''' </summary>
            E_HANDLE = &H80070006
            ''' <summary>
            ''' The function received an invalid parameter.
            ''' </summary>
            E_INVALIDARG = &H80070057
            ''' <summary>
            ''' Out of memory
            ''' </summary>
            E_OUTOFMEMORY = &H8007000e
            ''' <summary>
            ''' Not implemented
            ''' </summary>
            E_NOTIMPL = &H80004001
            ''' <summary>
            ''' Unknown error
            ''' </summary>
            E_FAIL = &H80000008
            ''' <summary>
            ''' File not filtered due to password protection
            ''' </summary>
            FILTER_E_PASSWORD = &H8004170b
            ''' <summary>
            ''' The document format is not recognised by the filter
            ''' </summary>
            FILTER_E_UNKNOWNFORMAT = &H8004170c
            ''' <summary>
            ''' No text in current chunk
            ''' </summary>
            FILTER_E_NO_TEXT = &H80041705
            ''' <summary>
            ''' No values in current chunk
            ''' </summary>
            FILTER_E_NO_VALUES = &H80041706
            ''' <summary>
            ''' No more chunks of text available in object
            ''' </summary>
            FILTER_E_END_OF_CHUNKS = &H80041700
            ''' <summary>
            ''' No more text available in chunk
            ''' </summary>
            FILTER_E_NO_MORE_TEXT = &H80041701
            ''' <summary>
            ''' No more property values available in chunk
            ''' </summary>
            FILTER_E_NO_MORE_VALUES = &H80041702
            ''' <summary>
            ''' Unable to access object
            ''' </summary>
            FILTER_E_ACCESS = &H80041703
            ''' <summary>
            ''' Moniker doesn't cover entire region
            ''' </summary>
            FILTER_W_MONIKER_CLIPPED = &H41704
            ''' <summary>
            ''' Unable to bind IFilter for embedded object
            ''' </summary>
            FILTER_E_EMBEDDING_UNAVAILABLE = &H80041707
            ''' <summary>
            ''' Unable to bind IFilter for linked object
            ''' </summary>
            FILTER_E_LINK_UNAVAILABLE = &H80041708
            ''' <summary>
            ''' This is the last text in the current chunk
            ''' </summary>
            FILTER_S_LAST_TEXT = &H41709
            ''' <summary>
            ''' This is the last value in the current chunk
            ''' </summary>
            FILTER_S_LAST_VALUES = &H4170a
        End Enum
       
       
        ''' <summary>
        ''' Flags controlling the operation of the FileFilter
        ''' instance.
        ''' </summary>
        <Flags()> _
        Public Enum IFILTER_INIT
            ''' <summary>
            ''' Paragraph breaks should be marked with the Unicode PARAGRAPH
            ''' SEPARATOR (0x2029)
            ''' </summary>
            IFILTER_INIT_CANON_PARAGRAPHS = 1
           
            ''' <summary>
            ''' Soft returns, such as the newline character in Microsoft Word, should
            ''' be replaced by hard returnsLINE SEPARATOR (0x2028). Existing hard
            ''' returns can be doubled. A carriage return (0x000D), line feed (0x000A),
            ''' or the carriage return and line feed in combination should be considered
            ''' a hard return. The intent is to enable pattern-expression matches that
            ''' match against observed line breaks.
            ''' </summary>
            IFILTER_INIT_HARD_LINE_BREAKS = 2
           
            ''' <summary>
            ''' Various word-processing programs have forms of hyphens that are not
            ''' represented in the host character set, such as optional hyphens
            ''' (appearing only at the end of a line) and nonbreaking hyphens. This flag
            ''' indicates that optional hyphens are to be converted to nulls, and
            ''' non-breaking hyphens are to be converted to normal hyphens (0x2010), or
            ''' HYPHEN-MINUSES (0x002D).
            ''' </summary>
            IFILTER_INIT_CANON_HYPHENS = 4
           
            ''' <summary>
            ''' Just as the IFILTER_INIT_CANON_HYPHENS flag standardizes hyphens,
            ''' this one standardizes spaces. All special space characters, such as
            ''' nonbreaking spaces, are converted to the standard space character
            ''' (0x0020).
            ''' </summary>
            IFILTER_INIT_CANON_SPACES = 8
           
            ''' <summary>
            ''' Indicates that the client wants text split into chunks representing
            ''' public value-type properties.
            ''' </summary>
            IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16
           
            ''' <summary>
            ''' Indicates that the client wants text split into chunks representing
            ''' properties determined during the indexing process.
            ''' </summary>
            IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256
           
            ''' <summary>
            ''' Any properties not covered by the IFILTER_INIT_APPLY_INDEX_ATTRIBUTES
            ''' and IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES flags should be emitted.
            ''' </summary>
            IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32
           
            ''' <summary>
            ''' Optimizes IFilter for indexing because the client calls the
            ''' IFilter::Init method only once and does not call IFilter::BindRegion.
            ''' This eliminates the possibility of accessing a chunk both before and
            ''' after accessing another chunk.
            ''' </summary>
            IFILTER_INIT_INDEXING_ONLY = 64
           
            ''' <summary>
            ''' The text extraction process must recursively search all linked
            ''' objects within the document. If a link is unavailable, the
            ''' IFilter::GetChunk call that would have obtained the first chunk of the
            ''' link should return FILTER_E_LINK_UNAVAILABLE.
            ''' </summary>
            IFILTER_INIT_SEARCH_LINKS = 128
           
            ''' <summary>
            ''' The content indexing process can return property values set by the filter.
            ''' </summary>
            IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
        End Enum
       
       
       
        <Flags()> _
        Public Enum IFILTER_FLAGS
            ''' <summary>
            ''' The caller should use the IPropertySetStorage and IPropertyStorage
            ''' interfaces to locate additional properties.
            ''' When this flag is set, properties available through COM
            ''' enumerators should not be returned from IFilter.
            ''' </summary>
            IFILTER_FLAGS_OLE_PROPERTIES = 1
        End Enum
       
       
        Public Structure STAT_CHUNK
            ''' <summary>
            ''' The chunk identifier. Chunk identifiers must be unique for the
            ''' current instance of the IFilter interface.
            ''' Chunk identifiers must be in ascending order. The order in which
            ''' chunks are numbered should correspond to the order in which they appear
            ''' in the source document. Some search engines can take advantage of the
            ''' proximity of chunks of various properties. If so, the order in which
            ''' chunks with different properties are emitted will be important to the
            ''' search engine.
            ''' </summary>
            Public idChunk As Integer
           
            ''' <summary>
            ''' The type of break that separates the previous chunk from the current
            ''' chunk. Values are from the CHUNK_BREAKTYPE enumeration.
            ''' </summary>
            <MarshalAs(UnmanagedType.U4)> _
            Public breakType As CHUNK_BREAKTYPE
           
            ''' <summary>
            ''' Flags indicate whether this chunk contains a text-type or a
            ''' value-type property.
            ''' Flag values are taken from the CHUNKSTATE enumeration. If the CHUNK_TEXT flag is set,
            ''' IFilter::GetText should be used to retrieve the contents of the chunk
            ''' as a series of words.
            ''' If the CHUNK_VALUE flag is set, IFilter::GetValue should be used to retrieve
            ''' the value and treat it as a single property value. If the filter dictates that the same
            ''' content be treated as both text and as a value, the chunk should be emitted twice in two
            ''' different chunks, each with one flag set.
            ''' </summary>
            <MarshalAs(UnmanagedType.U4)> _
            Public flags As CHUNKSTATE
           
            ''' <summary>
            ''' The language and sublanguage associated with a chunk of text. Chunk locale is used
            ''' by document indexers to perform proper word breaking of text. If the chunk is
            ''' neither text-type nor a value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR,
            ''' this field is ignored.
            ''' </summary>
            Public locale As Integer
           
            ''' <summary>
            ''' The property to be applied to the chunk. If a filter requires that the same text
            ''' have more than one property, it needs to emit the text once for each property
            ''' in separate chunks.
            ''' </summary> '还有下一页

 

[1] [2] 下一页

  • 上一篇:

  • 下一篇:
  • 【字体: 】【打印此文】【关闭窗口】
      相关文章:(只显示最新16条)
    ASP.NET读取文本文件的方法
    VB.Net实现快速文本搜索的方法
    VB.Net实现动态配置exe文件

    | 设为首页 | 加入收藏 | 联系站长 | | 友情链接 | 版权申明 |
    乐博网欢迎各种媒体转载我们的原创作品[转载请注明出处];我们鼓励更多VB.Net开发者一起加入研究与探讨;如发现文章访问错误、内容错误或版权疑问、内容有违相关法律(如涉及政治、色情、反动或散布虚假有害信息)等情况,请及时向我们举报,我们将及时纠正!
    联系邮箱:Shiny#vip.qq.com (#替换为@) QQ交流群: 40797788 [闽ICP备05014267号]