/* Copyright (C) 2003-2008 The University of Iowa 
 *
 * This file is part of the Das2 <www.das2.org> utilities library.
 *
 * Das2 utilities are free software: you can redistribute and/or modify them
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your
 * option) any later version.
 *
 * Das2 utilities are distributed in the hope that they will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * as well as the GNU General Public License along with Das2 utilities.  If
 * not, see <http://www.gnu.org/licenses/>.
 *
 * HtmlUtil.java
 *
 * Created on May 14, 2004, 9:06 AM
 */

package org.das2.util.filesystem;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.das2.util.Base64;
import org.das2.util.FileUtil;
import org.das2.util.LoggerManager;
import org.das2.util.monitor.CancelledOperationException;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * HTML utilities, such as getting a directory listing, where a "file" is a link
 * below the directory we are listing, and read a URL into a String.
 * @author  Jeremy
 */
public class HtmlUtil {

    private final static Logger logger= LoggerManager.getLogger( "das2.filesystem.htmlutil" );

    /**
     * this logger is for opening connections to remote sites.
     */
    protected static final Logger loggerUrl= org.das2.util.LoggerManager.getLogger( "das2.url" );

    /**
     * matches href="..." or href='...' (case-insensitive); group 2 is the link
     * target.  Compiled once because it is applied to every listing page.
     */
    private static final Pattern HREF_PATTERN= Pattern.compile( "(?i)href\\s*=\\s*([\"'])(.+?)\\1" );

    /**
     * return true if the URL appears to refer to a directory, which is
     * identified by a trailing slash on its file part (consistent with the
     * check in {@link #getDirectoryListing(java.net.URL)}).
     * Note the previous implementation returned true when the URL did
     * <em>not</em> end in a slash, contradicting the method name; this has
     * been corrected, and an empty file part no longer throws
     * StringIndexOutOfBoundsException.
     * @param url the URL to test.
     * @return true if the URL's file part ends with '/'.
     */
    public static boolean isDirectory( URL url ) {
        String file= url.getFile();
        return !file.isEmpty() && file.charAt(file.length()-1) == '/';
    }

    /**
     * nice clients consume both the stderr and stdout coming from websites.
     * This reads everything off of the stream and closes it.
     * http://docs.oracle.com/javase/1.5.0/docs/guide/net/http-keepalive.html suggests that you "do not abandon connection"
     * @param err the input stream
     * @throws IOException 
     * @see HttpUtil#consumeStream(java.io.InputStream) 
     * @deprecated use {@link HttpUtil#consumeStream(java.io.InputStream)}.
     */
    @Deprecated
    public static void consumeStream( InputStream err ) throws IOException {
        HttpUtil.consumeStream(err);
    }

    /**
     * parse an Amazon S3 "ListBucketResult" XML listing, resolving each
     * Contents/Key entry against the root URL.
     * @param root the directory URL used as the context for the keys.
     * @param content the XML content of the listing page.
     * @return the resolved URLs; keys which do not form valid URLs are skipped.
     */
    private static URL[] getDirectoryListingAmazonS3( URL root, String content ) {
        try {
            DocumentBuilderFactory factory= DocumentBuilderFactory.newInstance();
            try {
                // the content comes from a remote server, so disable DTDs and
                // entity expansion to protect against XXE-style attacks.
                factory.setFeature( "http://apache.org/xml/features/disallow-doctype-decl", true );
                factory.setXIncludeAware( false );
                factory.setExpandEntityReferences( false );
            } catch ( ParserConfigurationException ex ) {
                // parser does not support the feature; continue with defaults.
                logger.log( Level.FINE, "unable to harden XML parser: {0}", ex.getMessage() );
            }
            DocumentBuilder builder= factory.newDocumentBuilder();
            InputSource source = new InputSource( new StringReader(content) );
            Document document= builder.parse(source);

            javax.xml.xpath.XPath xpath= XPathFactory.newInstance().newXPath();
            NodeList fs= (NodeList) xpath.evaluate( "/ListBucketResult/Contents/Key", document, XPathConstants.NODESET );

            int n= fs.getLength();
            List<URL> result= new ArrayList<>(n);
            for ( int i=0; i<n; i++ ) {
                org.w3c.dom.Node nn= fs.item(i);
                try {
                    result.add( new URL( root, nn.getTextContent() ) );
                } catch ( MalformedURLException e ) {
                    // skip unresolvable keys; the previous code silently left a
                    // null slot in the returned array here.
                    logger.log( Level.FINE, "skipping malformed key: {0}", nn.getTextContent() );
                }
            }
            return result.toArray( new URL[result.size()] );
        } catch (SAXException | IOException | XPathExpressionException | ParserConfigurationException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Get the listing of the web directory, returning links that are "under" the given URL.
     * Note this does not handle off-line modes where we need to log into
     * a website first, as is often the case for a hotel.
     *
     * This was refactored to support caching of listings by simply writing the content to disk.
     *
     * @param url the address.
     * @param urlStream stream containing the URL content, which must be UTF-8 (or US-ASCII)
     * @return list of URIs referred to in the page.
     * @throws IOException
     * @throws CancelledOperationException
     */
    public static URL[] getDirectoryListing( URL url, InputStream urlStream ) throws IOException, CancelledOperationException {    
        return getDirectoryListing( url, urlStream, true );
    }

    /**
     * Get the listing of the web directory, returning links that are "under" the given URL.
     * Note this does not handle off-line modes where we need to log into
     * a website first, as is often the case for a hotel.
     *
     * This was refactored to support caching of listings by simply writing the content to disk.
     *
     * @param url the address.
     * @param urlStream stream containing the URL content, which must be UTF-8 (or US-ASCII); it is closed when done.
     * @param childCheck only return links to URLs "under" the url.
     * @return list of URIs referred to in the page.
     * @throws IOException
     * @throws CancelledOperationException
     */
    public static URL[] getDirectoryListing( URL url, InputStream urlStream, boolean childCheck ) throws IOException, CancelledOperationException {
        // search the input stream for links: first read in the entire URL.

        long t0= System.currentTimeMillis();

        // accumulate all the bytes before decoding, so that multi-byte UTF-8
        // sequences which straddle a read() boundary are decoded correctly.
        // (the previous code decoded each 10000-byte chunk independently.)
        ByteArrayOutputStream bytes= new ByteArrayOutputStream( 10000 );
        byte[] b = new byte[10000];
        int numRead = urlStream.read(b);
        while (numRead != -1) {
            logger.finest("download listing");
            bytes.write( b, 0, numRead );
            numRead = urlStream.read(b);
        }
        urlStream.close();

        logger.log(Level.FINER, "read listing data in {0} millis", (System.currentTimeMillis() - t0));
        String content= new String( bytes.toByteArray(), StandardCharsets.UTF_8 );

        if ( content.startsWith("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<ListBucketResult")) {
            // Amazon S3 buckets return an XML listing instead of HTML.
            return getDirectoryListingAmazonS3( url, content );
        }

        Matcher matcher= HREF_PATTERN.matcher( content );

        List<URL> urlList= new ArrayList<>();

        String surl= url.toString();

        while ( matcher.find() ) {

            String strLink= matcher.group(2);
            logger.log(Level.FINEST, "parse listing {0}", strLink);
            URL urlLink;

            try {
                urlLink = new URL(url, URLDecoder.decode(strLink,"UTF-8") );
                strLink = urlLink.toString();
                if ( strLink.contains("data-item-type=") ) { // kludge for https://github.com/autoplot/dev/master/bugs/sf/2376/
                    continue;
                }
                if ( strLink.contains("#") ) {
                    continue;   // get rid of many https://abbith.physics.uiowa.edu/assets/icons-2cb47a6dce56387af715816406f3f0d5d68651436bd5c96807123fcf421ad07d.svg#chevron-down
                }
            } catch (MalformedURLException e) {
                logger.log(Level.SEVERE, "bad URL: {0} {1}", new Object[]{url, strLink});
                continue;
            }

            if ( childCheck ) {
                // keep only links strictly below url, with no query part, and
                // which do not climb back out of the directory.
                if ( strLink.startsWith(surl) && strLink.length() > surl.length() && null==urlLink.getQuery() ) {
                    String file= strLink.substring( surl.length() );
                    if ( !file.startsWith("../") ) {
                        urlList.add( urlLink );
                    }
                }
            } else {
                urlList.add( urlLink );
            }
        }

        return urlList.toArray( new URL[urlList.size()] );

    }

    /**
     * Get the listing of the web directory, returning links that are "under" the given URL.
     * Note this does not handle off-line modes where we need to log into
     * a website first, as is often the case for a hotel.
     * @param url the address, to which a trailing slash is appended if missing.
     * @return list of URIs referred to in the page.
     * @throws IOException
     * @throws CancelledOperationException
     */
    public static URL[] getDirectoryListing( URL url ) throws IOException, CancelledOperationException {

        logger.log(Level.FINER, "listing {0}", url);

        String file= url.getFile();
        if ( file.charAt(file.length()-1)!='/' ) {
            url= new URL( url.toString()+'/' );
        }

        InputStream urlStream= getInputStream(url);

        return getDirectoryListing( url, urlStream );
    }

    /**
     * get the inputStream, following redirects if a 301 or 302 is encountered.  
     * The scientist may be prompted for a password, but only if "user@" is
     * in the URL.
     * 
     * Note this does not explicitly close the connections
     * to the server, and Java may not know to release the resources.  
     * TODO: fix this by wrapping the input stream and closing the connection
     * when the stream is closed.  This was done in Autoplot's DataSetURI.downloadResourceAsTempFile
     * @see org.autoplot.datasource.DataSetURI#downloadResourceAsTempFile
     * 
     * @param url the address to read.
     * @return input stream
     * @throws IOException 
     * @throws org.das2.util.monitor.CancelledOperationException 
     */
    public static InputStream getInputStream( URL url ) throws IOException, CancelledOperationException {

        loggerUrl.log(Level.FINE, "getInputStream {0}", new Object[] { url } );

        long t0= System.currentTimeMillis();

        // may prompt for credentials when the URL contains "user@".
        String userInfo= KeyChain.getDefault().getUserInfo(url);

        loggerUrl.log(Level.FINE, "openConnect {0}", new Object[] { url } );
        URLConnection urlConnection = url.openConnection();

        urlConnection.setAllowUserInteraction(false);
        urlConnection.setConnectTimeout(FileSystem.settings().getConnectTimeoutMs() );

        logger.log(Level.FINER, "connected in {0} millis", (System.currentTimeMillis() - t0));
        if ( userInfo != null) {
            // NOTE(review): getBytes() uses the platform default charset here;
            // RFC 7617 recommends UTF-8 for Basic auth -- confirm before changing.
            String encode = Base64.getEncoder().encodeToString( userInfo.getBytes());
            urlConnection.setRequestProperty("Authorization", "Basic " + encode);
        }

        urlConnection= HttpUtil.checkRedirect( urlConnection );
        InputStream ins= urlConnection.getInputStream();

        // developer switch: flip to true to capture each response under /tmp/ap/.
        boolean keepResponseForDebugging=false;

        if ( !keepResponseForDebugging ) {
            return ins;
        } else {
            StringBuilder keep= new StringBuilder();
            byte[] buf= new byte[4096];
            int bytesRead=ins.read(buf);
            int totalBytesRead=0;
            while ( bytesRead>0 ) {
                totalBytesRead+=bytesRead;
                for ( int i=0; i<bytesRead; i++ ) {
                    keep.append((char)buf[i]);
                }
                bytesRead=ins.read(buf);
            }
            long t02= (System.currentTimeMillis()-1718399881869L);
            File file= new File( "/tmp/ap/"+url.getFile().hashCode()+"."+ String.format("%09d",t02) +".html");
            FileUtil.writeStringToFile( file, keep.toString() );
            logger.log(Level.INFO, "writing html listing to {0}", file);
            return new FileInputStream(file);
        }

    }

    /**
     * read the contents of the URL into a string, assuming UTF-8 encoding.
     * @param url the address to read.
     * @return the content as a String.
     * @throws IOException
     * @throws CancelledOperationException 
     */
    public static String readToString( URL url ) throws IOException, CancelledOperationException {
        InputStream ins= getInputStream( url );
        try {
            // accumulate all the bytes before decoding, so that multi-byte
            // UTF-8 sequences split across read boundaries decode correctly.
            ByteArrayOutputStream bytes= new ByteArrayOutputStream();
            byte[] buf= new byte[2048];
            int i= ins.read(buf);
            while ( i>-1 ) {
                bytes.write( buf, 0, i );
                i= ins.read(buf);
            }
            return new String( bytes.toByteArray(), StandardCharsets.UTF_8 );
        } finally {
            ins.close();  // the previous implementation leaked this stream.
        }
    }

    /**
     * return the metadata about a URL.  This will support http, https,
     * and ftp, and will check for redirects.  This will 
     * allow caching of head requests.
     * @param url ftp,https, or http URL
     * @param props if non-null, may be a map containing cookie.
     * @return the metadata
     * @throws java.io.IOException when HEAD requests are made.
     * @see HttpUtil#getMetadata(java.net.URL, java.util.Map) 
     * @deprecated use {@link HttpUtil#getMetadata(java.net.URL, java.util.Map)}.
     */
    @Deprecated
    public static Map<String,String> getMetadata( URL url, Map<String,String> props ) throws IOException {
        return HttpUtil.getMetadata(url, props);
    }

    /**
     * check for 301, 302 or 303 redirects, and return a new connection in this case.
     * This should be called immediately before the urlConnection.connect call,
     * as this must connect to get the response code.
     * @param urlConnection if an HttpUrlConnection, check for 301 or 302; return connection otherwise.
     * @return a connection, typically the same one as passed in.
     * @throws IOException 
     * @see HttpUtil#checkRedirect(java.net.URLConnection) 
     * @deprecated use {@link HttpUtil#checkRedirect(java.net.URLConnection)}.
     */
    @Deprecated
    public static URLConnection checkRedirect( URLConnection urlConnection ) throws IOException {
        return HttpUtil.checkRedirect(urlConnection);
    }

    /**
     * return the links found in the content, using url as the context.
     * @param url null or the url for the context.
     * @param content the html content.
     * @return a list of URLs.
     */
    public static List<URL> getLinks( URL url, String content ) {
        Matcher matcher= HREF_PATTERN.matcher( content );

        List<URL> urlList= new ArrayList<>();

        while ( matcher.find() ) {
            String strLink= matcher.group(2);
            logger.log(Level.FINEST, "parse listing {0}", strLink);

            try {
                urlList.add( new URL( url, URLDecoder.decode(strLink,"UTF-8") ) );
            } catch (MalformedURLException e) {
                logger.log(Level.SEVERE, "bad URL: {0} {1}", new Object[]{url, strLink});
            } catch (UnsupportedEncodingException ex) {
                // cannot happen: UTF-8 is always supported by the JVM.
                logger.log(Level.SEVERE, null, ex);
            }

        }
        return urlList;
    }

}