/* This file is part of Webarchiver
 *  Copyright (C) 2001 by Andreas Schlapbach <schlpbch@iam.unibe.ch>
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *  Boston, MA 02111-1307, USA.
 **/

/* $Id: plugin_webarchiver.cpp,v 1.23.2.1 2003/02/01 09:35:48 lukas Exp $ */

/*
 * There are two recursions within this code:
 * - Recursively create DOM-Tree for referenced links which get recursively
 *   converted to HTML
 *
 * => This code has the potential to download whole sites to a TarGz-Archive
 */

//#define DEBUG_WAR

#include <assert.h>

#include <qdir.h>
#include <qfile.h>

#include <kaction.h>
#include <kinstance.h>
#include <ktempfile.h>

#include <kfiledialog.h>
#include <kmessagebox.h>
#include <klocale.h>
#include <kio/netaccess.h>
#include <khtml_part.h>
#include <kdebug.h>
#include <kgenericfactory.h>

#include "plugin_webarchiver.h"

// Generic factory for PluginWebArchiver, exported as the component factory of
// the libwebarchiverplugin library with the instance name "webarchiver".
typedef KGenericFactory<PluginWebArchiver> PluginWebArchiverFactory;
K_EXPORT_COMPONENT_FACTORY( libwebarchiverplugin,
	                    PluginWebArchiverFactory( "webarchiver" ) );

/* Sets up the plugin: whitespace preservation starts out disabled and the
 * "archivepage" action is created and connected to slotSaveToArchive().
 */
PluginWebArchiver::PluginWebArchiver( QObject* parent, const char* name,
                                      const QStringList & )
  : Plugin( parent, name ), m_bPreserveWS(false)
{
  const QString actionLabel = i18n("Ar&chive Web Page...");

  // The returned pointer is intentionally not kept.
  (void) new KAction( actionLabel, "webarchiver", 0,
                      this, SLOT(slotSaveToArchive()),
                      actionCollection(), "archivepage" );
}

/* Nothing to release here: the archive and the bookkeeping dictionaries are
 * created and deleted again within slotSaveToArchive().
 */
PluginWebArchiver::~PluginWebArchiver()
{
}

/* Slot behind the "Archive Web Page..." action.
 *
 * Proposes a file name derived from the page title, lets the user pick the
 * target file, and writes the current page together with the resources it
 * references into a gzipped tar archive. Does nothing when the plugin's
 * parent is not a KHTMLPart or when the user cancels a dialog.
 */
void PluginWebArchiver::slotSaveToArchive()
{
  // ## Unicode ok?
  if( !parent() || !parent()->inherits("KHTMLPart"))
    return;
  KHTMLPart *part = static_cast<KHTMLPart *>( parent() );

  QString archiveName = QString::fromUtf8(part->htmlDocument().title().string().utf8());

  if (archiveName.isEmpty())
    archiveName = i18n("Untitled");

  // A '/' in the title would be read as a directory separator once the name
  // is appended to the home directory below.
  archiveName.replace( QRegExp("/"), "_");

  // Replace space with underscore, proposed Frank Pieczynski <pieczy@knuut.de>
  archiveName = archiveName.simplifyWhiteSpace().replace( QRegExp("\\s+"), "_");

  archiveName = QDir::homeDirPath() + "/" + archiveName + ".war"; //Thanks ade

  KURL url = KFileDialog::getSaveFileName(archiveName, QString::fromLatin1("*.tgz *.war|")+
                                          i18n("Web Archives"), part->widget(),
                                          i18n("Save Page as Web-Archive") );

  if (url.isEmpty()) { return; }

  if (!(url.isValid())) {
    const QString title = i18n( "Invalid URL" );
    const QString text = i18n( "This URL \n %1 \n is not valid." ).arg(url.prettyURL());
    KMessageBox::sorry(part->widget(), text, title );
    return;
  }

  const QFile file(url.path());
  if (file.exists()) {
    const QString title = i18n( "File exists" );
    const QString text = i18n( "Do you really want to overwrite: \n%1?" ).arg(url.prettyURL());
    if (KMessageBox::Yes != KMessageBox::warningYesNoCancel( part->widget(), text, title ) ) {
      return;
    }
  }
  m_tarBall = new KTarGz(url.path(),"application/x-gzip");

  if (m_tarBall->open(IO_WriteOnly)) {
#ifdef DEBUG_WAR
    kdDebug() << "Web Archive opened " << endl;
#endif

    // Bookkeeping of downloaded URLs and links. The values are heap-allocated
    // QStrings, so let the dictionaries delete them when they are destroyed.
    m_downloadedURLDict = new QDict<QString> ( 17, true );
    m_downloadedURLDict->setAutoDelete( true );
    m_linkDict = new QDict<QString> ( 17, true );
    m_linkDict->setAutoDelete( true );

    // Reserve the name of the main document before any resource is stored
    m_linkDict->insert("index.html", new QString(""));
    saveFile("index.html", part);

    delete m_linkDict;
    m_linkDict = 0;
    delete m_downloadedURLDict;
    m_downloadedURLDict = 0;

    m_tarBall->close();

    KMessageBox::information(part->widget(), i18n( "Archiving webpage completed." ),
                             QString::null, QString::null, false);
#ifdef DEBUG_WAR
    kdDebug() << "Web Archive closed " << endl;
#endif

  } else {
    const QString title = i18n( "Unable to open Web-Archive" );
    const QString text = i18n( "Unable to open \n %1 \n for writing." ).arg(url.prettyURL());
    KMessageBox::sorry( part->widget(), text, title );
  }

  // Do not leave a dangling pointer behind between two invocations
  delete m_tarBall;
  m_tarBall = 0;
}

/* Store the HTMLized DOM-Tree to a temporary file and add it to the Tar-Ball */

void PluginWebArchiver::saveFile( const QString& fileName,  const KHTMLPart *part)
{
  KTempFile tmpFile;
  if (!(tmpFile.status())) {
    QTextStream* textStream = tmpFile.textStream();
    textStream->setEncoding(QTextStream::Locale);
    this->saveToArchive(part, textStream);

    tmpFile.close();

    QFile file(tmpFile.name());
    file.open(IO_ReadOnly);
    m_tarBall->writeFile(fileName, QString::null, QString::null, file.size(), file.readAll());
#ifdef DEBUG_WAR
    kdDebug() << "HTML-file written: " << fileName << endl;
#endif
    file.close();

    // Cleaning up
    file.remove();
  } else {
    const QString title = i18n( "Couldn't open a temporary file" );
    const QString text = i18n( "Couldn't open a temporary file" );
    KMessageBox::sorry( 0, text, title );
  }
}

/* Recursively travers the DOM-Tree */

void PluginWebArchiver::saveToArchive(const KHTMLPart *part, QTextStream* _textStream)
{
  assert(_textStream);

  // Add a doctype

  (*_textStream) <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">" << endl;

  const DOM::Node &pNode = part->document();
  if(pNode.ownerDocument().isNull()) {
    saveArchiveRecursive(pNode, part->url(), _textStream, 0);
  } else {
    saveArchiveRecursive(pNode.ownerDocument(), part->url(), _textStream, 0);
  }
}

/* Transform DOM-Tree to HTML
 *
 * Writes pNode to _textStream and then recurses into its children:
 *  - elements become an opening tag with their attributes, followed by the
 *    children and, unless HTML 4.01 forbids it, a closing tag;
 *  - other nodes with a non-empty value are written as text (CSS inside
 *    STYLE is run through analyzeInternalCSS(), SCRIPT content is written
 *    verbatim, everything else is escaped).
 *
 * pNode:       node to serialize
 * baseURL:     URL that relative links of the document are resolved against
 * _textStream: destination stream
 * indent:      number of spaces put in front of each line written for pNode;
 *              children are indented by two more. Ignored inside PRE.
 */

void PluginWebArchiver::saveArchiveRecursive(const DOM::Node &pNode, const KURL& baseURL,
                                             QTextStream* _textStream, int indent)
{
  const QString nodeName(pNode.nodeName().string());
  QString text;
  QString strIndent;
  strIndent.fill(' ', indent);
  // Null unless pNode actually is an element
  const DOM::Element element = static_cast<DOM::Element>(pNode);
  DOM::Node child;

  if ( !element.isNull() ) {
    if (nodeName.at(0)=='-') {
      /* Don't save khtml internal tags '-konq..'
       * Approximating it with <DIV>
       */
      text += "<DIV> <!-- -KONQ_BLOCK -->";
    } else if (nodeName == "BASE") {
      /* Skip BASE, everything is relative to index.html
       * Saving SCRIPT but they can cause trouble!
       */
    } else {
      if (!m_bPreserveWS) {
        if (nodeName == "PRE") {
          // From here on, until the matching </PRE>, nothing is indented
          // and no line breaks are added.
          m_bPreserveWS = true;
        }
        text = strIndent;
      }
      text += "<" + nodeName;
      QString attributes;
      QString attrName, attrValue;
      DOM::Attr attr;
      DOM::NamedNodeMap attrs = element.attributes();
      unsigned long lmap = attrs.length();
      for( unsigned int j=0; j<lmap; j++ ) {
        attr = static_cast<DOM::Attr>(attrs.item(j));
        attrName = attr.name().string();
        attrValue = attr.value().string();

#if 0
        if ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") {
          //attrValue = handleLink(baseURL, attrValue);

          /* Going recursively down creating a DOM-Tree for the Frame, second Level of recursion */
          //## Add Termination criteria, on the other hand frames are not indefinetly nested, are they :)

          KHTMLPart* part = new KHTMLPart();
          KURL absoluteURL = getAbsoluteURL(baseURL, attrValue);
          part->openURL(absoluteURL);
          saveFile(getUniqueFileName(absoluteURL.fileName()), part);
          delete part;

        } else if
#endif
        if ((nodeName == "LINK" && attrName == "HREF") || // Down load stylesheets, js-script, ..
            ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") ||
            ((nodeName == "IMG" || nodeName == "INPUT" || nodeName == "SCRIPT") && attrName == "SRC") ||
            ((nodeName == "BODY" || nodeName == "TABLE" || nodeName == "TH" || nodeName == "TD") && attrName == "BACKGROUND")) {
           // Some people use carriage return in file names and browsers support that!
           attrValue = handleLink(baseURL, attrValue.replace(QRegExp("\\s"), ""));
        }
        /*
         * ## Make recursion level configurable
         */
        /*
        } else if (nodeName == "A" && attrName == "HREF") {
           attrValue = handleLink(baseURL, attrValue);
        */

        // The value is written between double quotes, so a literal '"' in it
        // would end the attribute early. '&' is escaped first so that the
        // entity inserted for the quote is not escaped a second time.
        // NOTE(review): assumes attr.value() yields the unescaped value - confirm.
        attrValue.replace(QRegExp("&"), "&amp;");
        attrValue.replace(QRegExp("\""), "&quot;");

        attributes += " " + attrName + "=\"" + attrValue + "\"";
      }
      if (!(attributes.isEmpty())){
        text += " ";
      }
      text += attributes.simplifyWhiteSpace();
      text += ">";
    }
  } else {
    const QString& nodeValue(pNode.nodeValue().string());
    if (!(nodeValue.isEmpty())) {
      // Don't escape < > in JS or CSS
      QString parentNodeName = pNode.parentNode().nodeName().string();
      if (parentNodeName == "STYLE") {
        text = analyzeInternalCSS(baseURL, pNode.nodeValue().string());
      } else if (m_bPreserveWS) {
        text = stringToHTML(pNode.nodeValue().string());
      } else if (parentNodeName == "SCRIPT") {
        text = pNode.nodeValue().string();
      } else {
        text = strIndent + stringToHTML(pNode.nodeValue().string());
      }
    }
  }

#ifdef DEBUG_WAR
  kdDebug() <<  text << endl;
#endif
  if (!(text.isEmpty())) {
    (*_textStream) << text;
    if (!m_bPreserveWS) {
      (*_textStream) << endl;
    }
  }

  try
  {
    // We might throw a DOM exception
    child = pNode.firstChild();
  }
  catch (...)
  {
    // No children, stop recursion here
    child = DOM::Node();
  }

  while(!child.isNull()) {
    saveArchiveRecursive(child, baseURL, _textStream, indent+2);
    child = child.nextSibling();
  }

  if (!(element.isNull())) {
    if (nodeName == "AREA" || nodeName == "BASE" || nodeName == "BASEFONT" ||
        nodeName == "BR" || nodeName == "COL" || nodeName == "FRAME" ||
        nodeName == "HR" || nodeName == "IMG" || nodeName == "INPUT" ||
        nodeName == "ISINDEX" || nodeName == "LINK" || nodeName == "META" ||
        nodeName == "PARAM") {

      /* Closing Tag is forbidden, see HTML 4.01 Specs: Index of Elements  */

    } else {
      if (!m_bPreserveWS) {
        text = strIndent;
      } else {
        text ="";
      }
      if (nodeName.at(0)=='-') {
        text += "</DIV> <!-- -KONQ_BLOCK -->";
      } else {
        text += "</" + nodeName + ">";
        if (nodeName == "PRE") {
          m_bPreserveWS = false;
        }
      }
#ifdef DEBUG_WAR
      kdDebug() << text << endl;
#endif
      if (!(text.isEmpty())) {
        (*_textStream) << text;
        if (!m_bPreserveWS) {
          (*_textStream) << endl;
        }
      }
    }
  }
}

/* Resolve a link, download its content into the Tar-Ball and return the name
 * to use for it inside the archive.
 *
 * _url:  URL the link is relative to
 * _link: link as found in the document
 *
 * Each absolute URL is downloaded only once; later requests return the name
 * stored for it. When the resource cannot be downloaded or read, nothing is
 * added to the archive and the absolute URL is returned instead, so that the
 * archived page keeps pointing at the original resource rather than at an
 * empty link.
 */

QString PluginWebArchiver::handleLink(const KURL& _url, const QString& _link)
{
  KURL url(getAbsoluteURL(_url, _link));
  const QString absoluteUrl = url.url();

#ifdef DEBUG_WAR
  kdDebug() << "Link: " << _link << endl;
  kdDebug() << "URL : " << absoluteUrl << endl;
#endif

  // Only download file once
  const QString* knownName = m_downloadedURLDict->find(absoluteUrl);
  if (knownName) {
#ifdef DEBUG_WAR
    kdDebug() << "File allready downloaded: " << absoluteUrl << endl;
    kdDebug() << "TarFileName: [" << *knownName << "]" << endl << endl;
#endif
    return *knownName;
  }

  // Fallback used whenever the resource does not make it into the archive
  QString tarFileName = absoluteUrl;

  QString fileName;
  if (KIO::NetAccess::download( url, fileName )) {
#ifdef DEBUG_WAR
    kdDebug() << "FileName: " << fileName << endl;
#endif

    QFile file(fileName);
    if (file.open(IO_ReadOnly)) {
      tarFileName = getUniqueFileName(url.fileName());

      // Add file to Tar-Ball; read first so that size and data agree
      const QByteArray data = file.readAll();
      m_tarBall->writeFile(tarFileName, QString::null, QString::null, data.size(), data.data());
      file.close();

      // Add URL to downloaded URLs
      m_downloadedURLDict->insert(absoluteUrl, new QString(tarFileName));
      m_linkDict->insert(tarFileName, new QString(""));
    } else {
#ifdef DEBUG_WAR
      kdDebug() << "Couldn't read downloaded file: " << fileName << endl;
#endif
    }

    //Cleaning up
    KIO::NetAccess::removeTempFile( fileName );
  } else {
#ifdef DEBUG_WAR
    kdDebug() << "Couldn't download file: " << absoluteUrl << endl;
#endif
  }

#ifdef DEBUG_WAR
  kdDebug() << "TarFileName: [" << tarFileName << "]" << endl << endl;
#endif
  return tarFileName;
}

/* Build the URL to download for _link, taking _url as the base it is
 * relative to. The resolution itself is left to the KURL constructor.
 */

KURL PluginWebArchiver::getAbsoluteURL(const KURL& _url, const QString& _link)
{
  KURL resolved(_url, _link);
  return resolved;
}

/* Adds an id to a fileName to make it unique relative to the Tar-Ball
 *
 * fileName is returned unchanged when it is non-empty and not yet present in
 * m_linkDict. Otherwise a running number is put in front of it, and the
 * result is checked again, until a name is found that is not in m_linkDict.
 * The caller is responsible for inserting the returned name into m_linkDict.
 */

QString PluginWebArchiver::getUniqueFileName(const QString& fileName)
{
  // Name clash -> add unique id
  static int id;
  QString uniqueFileName(fileName);

#ifdef DEBUG_WAR
  kdDebug() << "getUniqueFileName(..): [" << fileName << "]" << endl;
#endif

  // Keep trying: the prefixed name may itself already be taken, e.g. when
  // "0a.png" is stored and "a.png" then clashes.
  while (uniqueFileName.isEmpty() || m_linkDict->find(uniqueFileName)) {
    QString strId;
    uniqueFileName = strId.setNum(id) + fileName;
    id++;
  }
  return uniqueFileName;
}

/* Escape the characters that are markup in HTML text: & < >
 *
 * Returns a copy of string with '&' replaced by "&amp;", '<' by "&lt;" and
 * '>' by "&gt;".
 */

QString PluginWebArchiver::stringToHTML(const QString& string)
{
  QString str(string);
  // '&' has to go first, otherwise the '&' of the entities inserted below
  // would be escaped again.
  str.replace( QRegExp("&"), "&amp;");
  str.replace( QRegExp("<"), "&lt;");
  str.replace( QRegExp(">"), "&gt;");
  return str;
}

/* Search for Images in CSS, extract them and adjust CSS
 *
 * _url:   URL that the links inside the style sheet are relative to
 * string: CSS text
 *
 * Every "url(...)" reference is passed to handleLink() and replaced by the
 * name handleLink() returns; one pair of surrounding quotes is kept in place.
 * Empty references are left alone, and scanning stops at a "url(" that has
 * no closing parenthesis. Returns the adjusted CSS text.
 */

QString PluginWebArchiver::analyzeInternalCSS(const KURL& _url, const QString& string)
{
#ifdef DEBUG_WAR
  kdDebug ()  << "analyzeInternalCSS" << endl;
#endif

  QString str(string);
  int pos = 0;

  for (;;) {
    pos = str.find("url(", pos);
    if (pos == -1)
      break;

    int startUrl = pos + 4; // url(
    const int closeParen = str.find(")", startUrl);
    if (closeParen == -1)
      break; // Unterminated url( - nothing sensible left to rewrite

    int endUrl = closeParen;
    if (startUrl < endUrl && (str[startUrl]=='"' || str[startUrl]=='\'')) // CSS 'feature'
      startUrl++;
    if (startUrl < endUrl && (str[endUrl-1]=='"' || str[endUrl-1]=='\'')) // CSS 'feature'
      endUrl--;

    QString url = str.mid(startUrl, endUrl-startUrl);

#ifdef DEBUG_WAR
    kdDebug () << "url: " << url << endl;
#endif

    if (url.isEmpty()) {
      // An empty link would resolve to the page itself
      pos = closeParen + 1;
      continue;
    }

    url = handleLink(_url, url);

#ifdef DEBUG_WAR
    kdDebug () << "url: " << url << endl;
#endif

    str.replace(startUrl, endUrl-startUrl, url);

    // The replacement may be longer or shorter than the original link, so
    // continue right behind it instead of at the old position of ')'.
    pos = startUrl + (int) url.length();
  }
  return str;
}

#include <plugin_webarchiver.moc>
