Using Internet Explorer's HTML Parser
by Andrew Tucker


Listing One
/* Implement an HTML parser using IE4's IHTMLDocument2 interface. */

#ifndef __HTML_H__
#define __HTML_H__

#include <windows.h>
#include <string>

// if you are using VC6 or higher, get this from the stock include
// directory; otherwise get it from the Internet SDK

#if _MSC_VER >= 1200
#pragma warning(disable:4099)   // disable spurious namespace warnings
#include <mshtmdid.h>
#else
#include "./inetsdk/include/mshtmdid.h"
#endif

#import "mshtml.dll" named_guids no_namespace
using namespace std;

// Thread message posted by HTMLParser::OnChanged when the document reaches
// READYSTATE_COMPLETE.  Parenthesized so the expansion stays a single value
// no matter which operators surround the macro at the point of use.
#define WM_USER_LOAD_COMPLETE   (WM_USER+1)
class HTMLParser: public IPropertyNotifySink, IOleClientSite, IDispatch
{
    public:
        static HTMLParser *Create();    // forces dynamic allocation
        STDMETHOD_(ULONG, Release)(); 
        BOOL LoadHTMLFile(LPCSTR pcszFile);
        long GetLinkCount();
        BOOL GetLinkURL(long lIndex, string &rstrURL);
        long GetImageCount();
        BOOL GetImageURL(long lIndex, string &rstrURL);
        BOOL IsConnected() const { return SUCCEEDED(m_hrConnected); }
    protected:
        // hidden constructors/destructor to force use of Create/Release
        HTMLParser(); 
        HTMLParser(const HTMLParser &); // eliminate compiler 
                                        // synthesized copy ctor
        virtual ~HTMLParser();
     // IUnknown methods
        STDMETHOD(QueryInterface)(REFIID riid, LPVOID* ppv);
        STDMETHOD_(ULONG, AddRef)();
    // IPropertyNotifySink methods
        STDMETHOD(OnChanged)(DISPID dispID);
        STDMETHOD(OnRequestEdit)(DISPID dispID) { return NOERROR; }
        // IOleClientSite methods
        STDMETHOD(SaveObject)(void) 
            { return E_NOTIMPL; }
        STDMETHOD(GetMoniker)(DWORD dwAssign,
                                   DWORD dwWhichMoniker, IMoniker** ppmk)
            { return E_NOTIMPL; }
        STDMETHOD(GetContainer)(IOleContainer** ppContainer)
            { return E_NOTIMPL; }
        STDMETHOD(ShowObject)(void)
            { return E_NOTIMPL; }
        STDMETHOD(OnShowWindow)(BOOL fShow)
            { return E_NOTIMPL; }
        STDMETHOD(RequestNewObjectLayout)(void)
            { return E_NOTIMPL; }
            // IDispatch method
        STDMETHOD(GetTypeInfoCount)(UINT* pctinfo)
            { return E_NOTIMPL; }
        STDMETHOD(GetTypeInfo)(UINT iTInfo, LCID lcid, ITypeInfo** ppTInfo)
            { return E_NOTIMPL; }
        STDMETHOD(GetIDsOfNames)(REFIID riid, LPOLESTR* rgszNames,
                                UINT cNames, LCID lcid, DISPID* rgDispId)
            { return E_NOTIMPL; }
        STDMETHOD(Invoke)(DISPID dispIdMember, REFIID riid, LCID lcid,
            WORD wFlags, DISPPARAMS __RPC_FAR *pDispParams,
            VARIANT __RPC_FAR *pVarResult, EXCEPINFO __RPC_FAR *pExcepInfo,
            UINT __RPC_FAR *puArgErr);
        // helper functions
        BOOL GetURLFromCollection(IHTMLElementCollection *pCollection, 
                                  REFIID rIID, long lIndex, string &rstrURL);
        // member variables
        DWORD   m_dwRef;
        HRESULT  m_hrConnected;
        DWORD    m_dwCookie;
        IHTMLDocument2* m_pMSHTML;
        LPCONNECTIONPOINT m_pCP;
        IHTMLElementCollection *m_pAnchorLinks;
        IHTMLElementCollection *m_pImageLinks;
};
#endif


Listing Two
/* Implement an HTML parser using IE4's IHTMLDocument2 interface. */
#include <windows.h>
#include <comdef.h>
#include <io.h>
#include "html.h"
#include <iostream>
using namespace std;
/* Factory: the only way to obtain an HTMLParser, since the constructors are
   hidden.  The caller gives the object up with Release(). */
HTMLParser *HTMLParser::Create()
{
    HTMLParser *pNewParser = new HTMLParser;
    return pNewParser;
}
// constructor/destructor
/* Create the MSHTML document object, install this object as its client site
   (so that it is asked for the DISPID_AMBIENT_DLCONTROL ambient property)
   and hook up the IPropertyNotifySink used to watch the ready state.
   Constructors cannot return an error, so the outcome is recorded in
   m_hrConnected and must be queried with IsConnected(). */
HTMLParser::HTMLParser()
{
    HRESULT hr;
    LPCONNECTIONPOINTCONTAINER pCPC = NULL;
    LPOLEOBJECT pOleObject = NULL;
    LPOLECONTROL pOleControl = NULL;
    // initialize all the class member variables
    m_dwRef = 1;    // must start at 1 for the current instance
    // Must be a *failure* code until Advise() has actually succeeded.
    // S_FALSE would not do: it is a success code, so SUCCEEDED(S_FALSE) is
    // true and IsConnected() would report TRUE after a failed setup.
    m_hrConnected = E_FAIL;
    m_dwCookie = 0;
    m_pMSHTML = NULL;
    m_pCP = NULL;
    m_pAnchorLinks = NULL;
    m_pImageLinks = NULL;
    // Create an instance of a dynamic HTML document
    if (FAILED(hr = CoCreateInstance( CLSID_HTMLDocument, NULL, 
           CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (LPVOID*)&m_pMSHTML )))
    {
        goto Error;
    }
    if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IOleObject, 
                                                   (LPVOID*)&pOleObject)))
    {
        goto Error;
    }
    // Become the document's client site.  Give up if this fails rather
    // than silently carrying on without a site.
    hr = pOleObject->SetClientSite((IOleClientSite*)this);
    pOleObject->Release();
    if (FAILED(hr))
    {
        goto Error;
    }
    if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IOleControl, 
                                                   (LPVOID*)&pOleControl)))
    {
        goto Error;
    }
    // Announce the download-control ambient property; the result of this
    // notification is not treated as fatal (unchanged from before).
    hr = pOleControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
    pOleControl->Release();
    // Hook up sink to catch ready state property change
    if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IConnectionPointContainer, 
                                                            (LPVOID*)&pCPC)))
    {
        goto Error;
    }
    if (FAILED(hr = pCPC->FindConnectionPoint(IID_IPropertyNotifySink, 
                                                                  &m_pCP)))
    {
        goto Error;
    }
    m_hrConnected = m_pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, 
                                                                &m_dwCookie);
Error:
    // Reached on success as well; the container is only needed temporarily.
    if (pCPC) pCPC->Release();
}
/* Release the element collections, disconnect the property-notify sink and
   release the document object. */
HTMLParser::~HTMLParser()
{
    if ( m_pAnchorLinks )
        m_pAnchorLinks->Release();
    if ( m_pImageLinks )
        m_pImageLinks->Release();
    if ( m_pCP )
    {
        // Only touch the connection point if it was actually obtained: a
        // success value in m_hrConnected alone does not prove m_pCP is
        // non-NULL, and Unadvise through NULL would crash.
        if (SUCCEEDED(m_hrConnected))
            m_pCP->Unadvise(m_dwCookie);
        m_pCP->Release();
    }
    if ( m_pMSHTML )
        m_pMSHTML->Release();
}
/* IUnknown::QueryInterface.  Supports IUnknown, IPropertyNotifySink,
   IOleClientSite and IDispatch.
   Returns NOERROR with an AddRef'ed pointer in *ppv, E_POINTER if ppv is
   NULL, or E_NOINTERFACE (with *ppv set to NULL) for any other IID. */
STDMETHODIMP HTMLParser::QueryInterface(REFIID riid, LPVOID* ppv)
{
    if (!ppv)
        return E_POINTER;
    *ppv = NULL;
    if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
    {
        // IUnknown is always answered through the IPropertyNotifySink base
        // so that every request for IUnknown yields the same pointer.
        *ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
    }
    else if (IID_IOleClientSite == riid)
    {
        *ppv = (IOleClientSite*)this;
    }
    else if (IID_IDispatch == riid)
    {
        *ppv = (IDispatch*)this;
    }
    else
    {
        // E_NOINTERFACE, not E_NOTIMPL, is the code COM callers expect for
        // an unsupported interface.
        return E_NOINTERFACE;
    }
    AddRef();
    return NOERROR;
}
/* IUnknown::AddRef -- bump the reference count and report the new value. */
STDMETHODIMP_(ULONG) HTMLParser::AddRef()
{
    m_dwRef += 1;
    return m_dwRef;
}
/* IUnknown::Release -- drop one reference and return the remaining count.
   The object deletes itself when the count reaches zero. */
STDMETHODIMP_(ULONG) HTMLParser::Release()
{
    // Keep the new count in a local: the member is gone after "delete this".
    const ULONG ulRemaining = --m_dwRef;
    if (0 == ulRemaining)
        delete this;
    return ulRemaining;
}
/* IPropertyNotifySink::OnChanged.  When the document's ready state changes,
   read the current value and, once it is READYSTATE_COMPLETE, post
   WM_USER_LOAD_COMPLETE to the current thread so that the message loop in
   LoadHTMLFile can stop waiting.  Always returns NOERROR. */
STDMETHODIMP HTMLParser::OnChanged(DISPID dispID)
{
    // Only the ready state is of interest.
    if (DISPID_READYSTATE != dispID)
        return NOERROR;

    VARIANT vResult;
    EXCEPINFO excepInfo = {0};
    UINT uArgErr = 0;
    DISPPARAMS dp = {NULL, NULL, 0, 0};
    VariantInit(&vResult);

    if (SUCCEEDED(m_pMSHTML->Invoke(DISPID_READYSTATE, IID_NULL, 
                      LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, 
                      &dp, &vResult, &excepInfo, &uArgErr)))
    {
        // Only trust the lVal member if the variant really carries a VT_I4.
        if (VT_I4 == V_VT(&vResult) &&
            READYSTATE_COMPLETE == V_I4(&vResult))
        {
            // IE4 is finished parsing the file
            PostThreadMessage(GetCurrentThreadId(),
                            WM_USER_LOAD_COMPLETE, (WPARAM)0, (LPARAM)0);
        }
        // every other ready state just means "still working"
        VariantClear(&vResult);
    }
    return NOERROR;
}
/* IDispatch::Invoke.  The only member served is the
   DISPID_AMBIENT_DLCONTROL ambient property.
   Returns E_POINTER if pVarResult is NULL, DISP_E_MEMBERNOTFOUND for any
   other DISPID, NOERROR otherwise. */
STDMETHODIMP HTMLParser::Invoke(DISPID dispIdMember, REFIID riid, LCID lcid,
            WORD wFlags, DISPPARAMS __RPC_FAR *pDispParams,
            VARIANT __RPC_FAR *pVarResult, EXCEPINFO __RPC_FAR *pExcepInfo,
            UINT __RPC_FAR *puArgErr)
{
    if (NULL == pVarResult)
        return E_POINTER;

    if (DISPID_AMBIENT_DLCONTROL != dispIdMember)
        return DISP_E_MEMBERNOTFOUND;

    // This tells IE4 that you want to download the page, but you don't 
    // want to run scripts, Java applets, or ActiveX controls
    const long lDownloadFlags = DLCTL_DOWNLOADONLY | 
                                DLCTL_NO_SCRIPTS | 
                                DLCTL_NO_JAVA |
                                DLCTL_NO_DLACTIVEXCTLS |
                                DLCTL_NO_RUNACTIVEXCTLS;
    V_VT(pVarResult) = VT_I4;
    V_I4(pVarResult) = lDownloadFlags;
    return NOERROR;
}
/* Load and parse an HTML file, then fetch its links and images collections.

   pcszFile  path of the file to load; NULL or a non-existent file yields
             FALSE without involving MSHTML.
   Returns   TRUE when the file was parsed and both collections were
             obtained, FALSE otherwise.  Collections from a previous load
             are always discarded first.

   The call blocks: it pumps the calling thread's message queue until
   OnChanged posts WM_USER_LOAD_COMPLETE.  Other messages retrieved in the
   meantime are dispatched. */
BOOL HTMLParser::LoadHTMLFile(LPCSTR pcszFile)
{
    HRESULT        hr;
    LPPERSISTFILE  pPF = NULL;
    MSG            msg;
    BOOL           bGot;
    BOOL           bOK = FALSE;

    // m_pMSHTML is checked as well so that a half-constructed object can
    // never be dereferenced below.
    if ( !IsConnected() || !m_pMSHTML )
        return FALSE;
    // kill any previous links
    if ( m_pAnchorLinks )
    {
        m_pAnchorLinks->Release();
        m_pAnchorLinks = NULL;
    }
    if ( m_pImageLinks )
    {
        m_pImageLinks->Release();
        m_pImageLinks = NULL;
    }
    // avoid IE error msg box if the file does not exist
    if ( !pcszFile || access(pcszFile, 0x00) != 0x00 )
    {
        return FALSE;
    }
    _bstr_t bstrFile(pcszFile);
    // use IPersistFile to load the HTML
    if ( SUCCEEDED(hr = m_pMSHTML->QueryInterface(IID_IPersistFile, 
                                                         (LPVOID*) &pPF)))
    {
        hr = pPF->Load((LPCWSTR)bstrFile, 0);
        pPF->Release();
    }
    if (SUCCEEDED(hr))
    {
        // GetMessage returns -1 on error, so test "> 0" explicitly;
        // a plain truth test would spin forever on a failing call.
        while ((bGot = GetMessage(&msg, NULL, 0, 0)) > 0)
        {
            // notification from OnChanged
            if (WM_USER_LOAD_COMPLETE == msg.message && NULL == msg.hwnd)
            {
                bOK = TRUE;
                break;
            }
            DispatchMessage(&msg);
        }
        // WM_QUIT was consumed by this nested loop; put it back so the
        // application's own message loop still sees it and terminates.
        if (0 == bGot)
            PostQuitMessage((int)msg.wParam);
    }
    if ( bOK )
    {
        if ( FAILED(m_pMSHTML->get_links(&m_pAnchorLinks)) ||
             FAILED(m_pMSHTML->get_images(&m_pImageLinks)) ) 
        {
            // all or nothing: never leave just one collection behind
            if ( m_pAnchorLinks )
            {
                m_pAnchorLinks->Release();
                m_pAnchorLinks = NULL;
            }
            if ( m_pImageLinks )
            {
                m_pImageLinks->Release();
                m_pImageLinks = NULL;
            }
            bOK = FALSE;
        }
    }
    return bOK;
}
/* Get the number of links present in the current HTML file */
long HTMLParser::GetLinkCount()
{
    long lCount = 0;
    if ( m_pAnchorLinks )
        m_pAnchorLinks->get_length(&lCount);
    return lCount;
}
/* Get the number of images present in the current HTML file */
long HTMLParser::GetImageCount()
{
    long lCount = 0;
    if ( m_pImageLinks )
        m_pImageLinks->get_length(&lCount);
    return lCount;
}
/* Fetch the URL of link number lIndex into rstrURL.
   Returns FALSE when the parser is not connected, no links collection is
   available, or the lookup itself fails. */
BOOL HTMLParser::GetLinkURL(long lIndex, string &rstrURL)
{
    if ( !IsConnected() || !m_pAnchorLinks )
        return FALSE;

    return GetURLFromCollection(m_pAnchorLinks, IID_IHTMLAnchorElement,
                                lIndex, rstrURL);
}
/* Fetch the URL of image number lIndex into rstrURL.
   Returns FALSE when the parser is not connected, no images collection is
   available, or the lookup itself fails. */
BOOL HTMLParser::GetImageURL(long lIndex, string &rstrURL)
{
    if ( !IsConnected() || !m_pImageLinks )
        return FALSE;

    return GetURLFromCollection(m_pImageLinks, IID_IHTMLImgElement,
                                lIndex, rstrURL);
}
/* Get URL associated with an element in a collection. Element must be an 
   image or an anchor.

   pCollection  collection to index; NULL yields FALSE.
   rIID         IID_IHTMLImgElement or IID_IHTMLAnchorElement; selects the
                interface used to read the element's href.
   lIndex       zero-based element index; negative values yield FALSE.
   rstrURL      receives the href; left untouched when FALSE is returned.
   Returns      TRUE if the element exists, supports rIID and delivered an
                href string. */
BOOL HTMLParser::GetURLFromCollection(IHTMLElementCollection *pCollection, 
                                  REFIID rIID, long lIndex, string &rstrURL)
{
    VARIANT     varIndex;
    VARIANT     varUnused;
    IDispatch*  pDisp = NULL; 
    BSTR        bstr = NULL;

    if ( !pCollection || lIndex < 0 )
        return FALSE;

    // Fully initialize the variant and tag it VT_I4, the type that matches
    // the lVal member being written.
    VariantInit( &varIndex );
    V_VT(&varIndex) = VT_I4;
    V_I4(&varIndex) = lIndex;
    VariantInit( &varUnused );

    if ( FAILED(pCollection->raw_item( varIndex, varUnused, &pDisp )) ||
         !pDisp )
    {
        return FALSE;
    }

    if ( rIID == IID_IHTMLImgElement )
    {
        IHTMLImgElement* pImgElem = NULL;
        if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pImgElem)) )
        {
            pImgElem->get_href(&bstr);
            pImgElem->Release();
        }
    }
    else if ( rIID == IID_IHTMLAnchorElement )
    {
        IHTMLAnchorElement* pAnchorElem = NULL;
        if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pAnchorElem)) )
        {
            pAnchorElem->get_href(&bstr);
            pAnchorElem->Release();
        }
    }
    pDisp->Release();

    if ( !bstr )
        return FALSE;

    // _bstr_t wrapper will delete since fCopy is FALSE
    _bstr_t bstrHREF(bstr, FALSE);
    // Never hand a NULL pointer to std::string's assignment.
    LPCSTR pszURL = (LPCSTR)bstrHREF;
    rstrURL = pszURL ? pszURL : "";
    return TRUE;
}




8


