package TESTTHIS;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import java.net.*;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.StringTokenizer;
import javax.net.ssl.HttpsURLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;

/**
 * Uses jsoup to try to read the ads.txt of the sites that could not be read with the first
 * crawler.
 *
 * <p>For every host listed in the site-list file, {@code https://<host>/ads.txt} is fetched, the
 * page text is split on commas, and the field that follows each field mentioning
 * {@code indexexchange.com} is collected. One record per site is appended to the report file.
 *
 * <p>Example record: {@code https://estar.jp/ads.txt, 189070, 183965, 189236,175407}
 */
public class ADSTEXTCRAWLERJSOUP {

    /** Report file; records are appended, the file is never truncated. */
    private static final String FINAL_REPORT = "C:///JAVA/ADSTXTCRAWLER/ADSTEXTREPORT2.txt";

    /** List of sites we could not get working, one host per line. */
    private static final String SITE_LIST = "C:///JAVA/ADSTXTCRAWLER/uniques.txt";

    /** Timeout, in milliseconds, handed to the jsoup connection. */
    private static final int TIMEOUT_MILLIS = 8000;

    /** User agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0";

    /** Marker searched for (case-insensitively) in each comma-separated field. */
    private static final String EXCHANGE_MARKER = "indexexchange.com";

    /**
     * Crawls every site from the site list and appends one record per site to the report.
     *
     * <p>A failure on one site is printed and does not stop the remaining sites from being
     * checked.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> sites = loadSites(SITE_LIST);

        for (String site : sites) {
            String adsTxtUrl = "https://" + site + "/ads.txt";
            try {
                System.out.println("Checking ...https://" + site);
                Document doc = Jsoup.connect(adsTxtUrl)
                        .timeout(TIMEOUT_MILLIS)
                        .userAgent(USER_AGENT)
                        .ignoreHttpErrors(true)
                        .get();

                String body = doc.body().text();
                System.out.printf("Body: %s", body);

                String ids = extractIds(body);
                System.out.println(adsTxtUrl + "" + ids);
                appendToReport(adsTxtUrl + ids);
            } catch (Exception e) {
                // Broad on purpose: this is the per-site boundary, and any failure (bad host,
                // timeout, I/O error) must only skip the current site.
                System.out.println("Crashed because " + e);
            }
        }
    }

    /**
     * Reads the site list, one host per line. Surrounding whitespace is removed and blank lines
     * are skipped so they do not turn into requests for {@code https:///ads.txt}.
     *
     * @param path location of the site-list file
     * @return the hosts read so far; empty or partial if the file could not be read
     */
    private static List<String> loadSites(String path) {
        List<String> sites = new ArrayList<>();
        // try-with-resources closes the reader even when reading fails part-way.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String site = line.trim();
                if (!site.isEmpty()) {
                    sites.add(site);
                }
            }
        } catch (IOException e) {
            System.out.println("Crashed because " + e);
        }
        return sites;
    }

    /**
     * Collects the field that follows every comma-separated field containing
     * {@code indexexchange.com}.
     *
     * <p>Each collected field is prefixed with a comma, so the result can be appended directly
     * to the URL to form a record (the first comma separates the URL from the first id).
     *
     * @param body text of the fetched ads.txt page
     * @return the comma-prefixed fields concatenated in order of appearance; empty if none
     */
    private static String extractIds(String body) {
        String[] fields = body.split(",");
        StringBuilder ids = new StringBuilder();
        // Stop one short of the end: a marker in the very last field has no following field,
        // and reading past the array would abort the whole record for this site.
        for (int i = 0; i + 1 < fields.length; i++) {
            // Locale.ROOT keeps the comparison independent of the machine's default locale.
            if (fields[i].toLowerCase(java.util.Locale.ROOT).contains(EXCHANGE_MARKER)) {
                String id = fields[i + 1];
                System.out.println(fields[i] + " " + id);
                ids.append(',').append(id);
            }
        }
        return ids.toString();
    }

    /**
     * Appends one record to the report file, preceded and followed by a line separator.
     *
     * @param record text to append
     * @throws IOException if the report file cannot be opened or written
     */
    private static void appendToReport(String record) throws IOException {
        // Opened in append mode and closed (which also flushes) after every record.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(FINAL_REPORT, true))) {
            writer.newLine();
            writer.write(record);
            writer.newLine();
        }
    }
}