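// 1. Requires the Jsoup library on the classpath.
// Jsoup's own parse methods accept a charset only for File and InputStream inputs; this class adds a String-based overload that takes a charset, so that Unicode text returned while scraping web pages is not turned into garbled characters.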
package com.isoft.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
/**
 * Helper that parses an HTML string with an explicitly supplied character set.
 *
 * <p>The HTML text is encoded to bytes with the given charset, decoded again with the same
 * charset, parsed, and the resulting document's output charset is set to that charset.
 */
public class IsJsoup {

    /** Charset used when the caller passes {@code null} as the charset name. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Unicode byte order mark (decimal 65279); dropped when it is the first decoded character. */
    private static final char BYTE_ORDER_MARK = 0xFEFF;

    /**
     * Parses an HTML string using the given charset.
     *
     * @param html        the HTML text to parse
     * @param charsetName name of the charset used to encode/decode the text; {@code null}
     *                    falls back to UTF-8
     * @param uri         base URI handed to the parser
     * @return the parsed document, with its output charset set to the charset that was used
     * @throws IOException if {@code charsetName} is not a supported encoding (raised by
     *                     {@link String#getBytes(String)})
     */
    public static Document parse(String html, String charsetName, String uri) throws IOException {
        // A null name previously caused a NullPointerException in getBytes; use the default.
        String effectiveCharset = (charsetName == null) ? DEFAULT_CHARSET : charsetName;
        ByteBuffer byteData = ByteBuffer.wrap(html.getBytes(effectiveCharset));
        return parseByteData(byteData, effectiveCharset, uri, Parser.htmlParser());
    }

    /**
     * Decodes the given bytes with the given charset and parses the result.
     *
     * <p>No charset detection from the HTML itself is performed: a {@code null} charset name
     * is treated as UTF-8 (it previously led to a NullPointerException).
     *
     * @param byteData    the raw bytes of the document; decoding consumes the buffer
     * @param charsetName name of the charset to decode with; {@code null} falls back to UTF-8,
     *                    an empty name is rejected by the validation call
     * @param baseUri     base URI handed to the parser
     * @param parser      the parser used to build the document
     * @return the parsed document, with its output charset set to the charset that was used
     */
    public static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
        String effectiveCharset;
        if (charsetName == null) {
            effectiveCharset = DEFAULT_CHARSET;
        } else {
            Validate.notEmpty(charsetName,
                    "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
            effectiveCharset = charsetName;
        }

        String docData = Charset.forName(effectiveCharset).decode(byteData).toString();

        // Strip a leading byte order mark so it does not end up inside the parsed document.
        if (docData.length() > 0 && docData.charAt(0) == BYTE_ORDER_MARK) {
            docData = docData.substring(1);
        }

        Document doc = parser.parseInput(docData, baseUri);
        doc.outputSettings().charset(effectiveCharset);
        return doc;
    }
}