JSOUP通过HTTP抓取网页时能够正常工作,但是抓取HTTPS网页时,它会引发以下异常:
javax.net.ssl.SSLHandshakeException: java.security.cert.CertPathValidatorException: Trust anchor for certification path not found.
通过以下代码,可以实现HTTPS网页的抓取:它让我们的程序信任所有站点,也就是跳过SSL证书校验和主机名校验。所用的SSL相关类都是JDK自带的,不需要额外引包(日志部分使用SLF4J)。注意:跳过校验后无法防范中间人攻击,请只在可以接受该风险的场景下使用:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;
/**
 * Helper that switches the JVM-wide {@link HttpsURLConnection} defaults to
 * "trust everything", so that HTTPS pages whose certificate chain cannot be
 * validated can still be fetched.
 *
 * <p><strong>Security warning:</strong> after {@link #trustEveryone()} has run,
 * neither the server certificate nor the host name is checked for connections
 * that use the default {@code HttpsURLConnection} settings. This removes the
 * protection against man-in-the-middle attacks; use it only where that risk is
 * acceptable.
 */
public class HttpsUtil {
    /**
     * Logger used to report a failure while installing the trust-all defaults.
     */
    private static final Logger logger = LoggerFactory.getLogger(HttpsUtil.class);

    /**
     * Trusts every site so that HTTPS pages can be accessed without certificate errors.
     *
     * <p>Installs two process-wide defaults on {@link HttpsURLConnection}:
     * <ol>
     *   <li>a {@link HostnameVerifier} that accepts every host name, and</li>
     *   <li>an SSL socket factory backed by an {@link X509TrustManager} that
     *       accepts every client and server certificate chain.</li>
     * </ol>
     *
     * <p>The method never throws: any failure is logged and the defaults that
     * could not be installed are left as they were.
     */
    public static void trustEveryone() {
        try {
            // Accept any host name, even when it does not match the certificate.
            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            });

            SSLContext context = SSLContext.getInstance("TLS");
            // No key managers (null); a single trust manager that performs no checks.
            context.init(null, new X509TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType)
                        throws CertificateException {
                    // Intentionally empty: returning normally means "trusted".
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType)
                        throws CertificateException {
                    // Intentionally empty: returning normally means "trusted".
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }}, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (Exception e) {
            // Stay best-effort (callers do not expect an exception), but do not
            // hide the failure: without this log a later SSLHandshakeException
            // would be hard to trace back to the setup step.
            logger.error("Failed to install trust-all HTTPS defaults", e);
        }
    }
}
最后,在调用JSOUP请求HTTPS页面之前,先调用 HttpsUtil.trustEveryone() 即可:
String pageurl = "https://www.baidu.com";
logger.info("〓〓〓〓〓获取的URL:{}",pageurl);
// Install the trust-all HTTPS defaults before the request is made; they only
// affect connections opened after this call.
HttpsUtil.trustEveryone();
// NOTE(review): JsoupUtil.getHtmlDoc is not shown here — presumably it fetches
// and parses the page with Jsoup; confirm against its definition.
Document doc = JsoupUtil.getHtmlDoc(pageurl);
转载请注明:前端收藏 » JSOUP 抓取HTTPS/HTTP网页校验问题