JSOUP通过HTTP抓取网页时能够正常工作,但是抓取HTTPS网页时,它会引发以下异常:
javax.net.ssl.SSLHandshakeException: java.security.cert.CertPathValidatorException: Trust anchor for certification path not found.
通过以下代码可以实现HTTPS网页的抓取:它让程序信任所有站点,即跳过SSL证书校验和主机名校验。SSL相关的类全部来自JDK自带的javax.net.ssl,无需为此额外引入依赖包。注意:跳过校验后将失去HTTPS对中间人攻击的防护,请仅在抓取公开网页等可接受该风险的场景中使用:
import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.security.SecureRandom; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.X509TrustManager; public class HttpsUtil { /** * 日志 */ private static final Logger logger = LoggerFactory.getLogger(HttpsUtil.class); /** * 信任任何站点,实现https页面的正常访问 */ public static void trustEveryone() { try { HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() { public boolean verify(String hostname, SSLSession session) { return true; } }); SSLContext context = SSLContext.getInstance("TLS"); context.init(null, new X509TrustManager[]{new X509TrustManager() { public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; } }}, new SecureRandom()); HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory()); } catch (Exception e) { // e.printStackTrace(); } } }
最后,在使用JSOUP请求HTTPS页面之前,先调用一次 HttpsUtil.trustEveryone() 即可:
// Install the trust-all HTTPS defaults before the first HTTPS request is issued;
// without this call the fetch below is made with the default certificate checks.
HttpsUtil.trustEveryone();

String pageurl = "https://www.baidu.com";
logger.info("〓〓〓〓〓获取的URL:{}", pageurl);
Document doc = JsoupUtil.getHtmlDoc(pageurl);
转载请注明:前端收藏 » JSOUP 抓取HTTPS/HTTP网页校验问题