package ADSCRAWLER;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.HttpsURLConnection;

/**
 * Ads.txt crawler.
 *
 * <p>Workflow:
 * <ol>
 *   <li>Read the list of publisher domains from {@code SITE_LIST}.</li>
 *   <li>Fetch {@code https://www.<site>/ads.txt} (falling back to plain HTTP).</li>
 *   <li>Record every indexexchange.com seller id found, one CSV row per site.</li>
 *   <li>Report sites with no reachable ads.txt, and sites only reachable over HTTP.</li>
 * </ol>
 *
 * <p>NOTE(review): {@link #readsellersjson()} is invoked but its result is not yet
 * cross-checked against the ids harvested from ads.txt — the original TODO
 * ("check that all ids found on ads.txt are on sellers.json") is still open.
 */
public class ADSCRAWLER {

    /** Well-known ads.txt resource name, appended to each site URL. */
    private static final String ADS_TXT = "ads.txt";

    /** Input: one publisher domain per line (e.g. {@code nypost.com/}). */
    private static final String SITE_LIST = "C:///JAVA/ADSTXTCRAWLER/sitelist.txt";

    /** Output: sites where ads.txt was unreachable over both HTTPS and HTTP. */
    private static final String FILE_MISSING_ADS_TXT = "C:///JAVA/ADSTXTCRAWLER/MISSINGADSTXT.txt";

    /** Output: CSV report of {@code site,id,id,...} for sites with ads.txt. */
    private static final String FINAL_REPORT = "C:///JAVA/ADSTXTCRAWLER/ADSTEXTREPORT.txt";

    /** Reserved for invalid-id reporting; not written yet (see class note). */
    private static final String FILE_ADS_TXT_ID_INVALID = "C:///JAVA/ADSTXTCRAWLER/ADSTXTIDINVALID.txt";

    /** Output: sites whose ads.txt was only reachable over plain HTTP. */
    private static final String FILE_SITE_NOT_SECURE = "C:///JAVA/ADSTXTCRAWLER/SITENOTSECURE.txt";

    /**
     * Crawls every site in {@code SITE_LIST} and appends results to the report files.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Seller ids from sellers.json; collected for the future id-validation step.
        List<String> sellerIds = readsellersjson();

        // Read the list of sites to check, one domain per line.
        List<String> mysites = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(SITE_LIST))) {
            String line;
            while ((line = br.readLine()) != null) {
                mysites.add(line);
            }
        } catch (IOException e) {
            // Without the site list there is nothing to crawl — report and stop.
            System.err.println("Cannot read site list " + SITE_LIST + ": " + e);
            return;
        }

        System.out.println("Starting Ads.txt Crawler...");
        // try-with-resources guarantees the report writers are closed (and thus
        // flushed) even if a write fails partway through the crawl.
        try (BufferedWriter writermissingads =
                     new BufferedWriter(new FileWriter(FILE_MISSING_ADS_TXT, true));
             BufferedWriter writerfinalreport =
                     new BufferedWriter(new FileWriter(FINAL_REPORT, true));
             BufferedWriter writernotsecure =
                     new BufferedWriter(new FileWriter(FILE_SITE_NOT_SECURE, true))) {

            for (String site : mysites) {
                System.out.println("Checking ...https://" + site);
                String results = readfile("https://www." + site);

                if (results.equalsIgnoreCase("Missing")) {
                    // HTTPS failed — retry over plain HTTP before declaring the
                    // site's ads.txt missing.
                    System.out.println("Checking ...http://" + site);
                    String results2 = readfile("http://www." + site);
                    if (results2.equalsIgnoreCase("Missing")) {
                        writermissingads.write(site);
                        writermissingads.newLine();
                        writermissingads.flush();
                    } else {
                        writerfinalreport.write(site + "," + results2);
                        writerfinalreport.newLine();
                        writerfinalreport.flush();
                        // HTTP worked where HTTPS did not: flag as not secure.
                        writernotsecure.write(site);
                        writernotsecure.newLine();
                        writernotsecure.flush();
                    }
                    System.out.println(site + " " + results2);
                } else {
                    // TODO: verify each id in `results` appears in `sellerIds`
                    // and write mismatches to FILE_ADS_TXT_ID_INVALID.
                    System.out.println(site + " " + results);
                    writerfinalreport.write(site + "," + results);
                    writerfinalreport.newLine();
                    writerfinalreport.flush();
                }
            }
        } catch (IOException e) {
            System.err.println("Failed writing crawl reports: " + e);
        }
        System.out.println("Done...");
    }

    /**
     * Checks whether an HTTPS URL can be constructed for {@code domain}.
     *
     * <p>NOTE(review): currently unused, and {@code openConnection()} does not
     * actually contact the server — this only validates that the URL parses.
     * Call {@code connect()} if a real reachability check is ever needed.
     *
     * @param domain bare domain name, e.g. {@code example.com}
     * @return true if {@code https://domain} is a well-formed URL
     */
    private static boolean testURL(String domain) {
        try {
            URL url = new URL("https://" + domain);
            HttpsURLConnection con = (HttpsURLConnection) url.openConnection();
            return con != null;
        } catch (IOException e) {
            return false;
        }
    }

    /**
     * Fetches {@code <sitename>/ads.txt} and extracts indexexchange.com seller ids.
     *
     * @param sitename site URL including scheme, without trailing slash
     * @return comma-separated seller ids, or the sentinel {@code "Missing"} when
     *         the file is unreachable or contains no indexexchange.com entries
     */
    private static String readfile(String sitename) {
        StringBuilder results = new StringBuilder();
        int counter = 0;
        try {
            URL oracle = new URL(sitename + "/" + ADS_TXT);
            // NOTE: the previous in.ready() guard was removed — ready() only says
            // whether a read would block and is often false on a fresh network
            // stream, which made valid ads.txt files look "Missing".
            try (BufferedReader in = new BufferedReader(new InputStreamReader(oracle.openStream()))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    if (!inputLine.toLowerCase().contains("indexexchange.com")) {
                        continue;
                    }
                    // ads.txt line format: domain, seller-id, relationship[, cert-authority]
                    String[] data = inputLine.split(",", 3);
                    if (data.length < 2) {
                        continue; // malformed line — skip instead of aborting the whole file
                    }
                    counter++;
                    if (results.length() > 0) {
                        results.append(',');
                    }
                    results.append(data[1]);
                }
            }
        } catch (Exception e) {
            // Unreachable host / 404 / TLS failure: treat as a missing ads.txt.
            return "Missing";
        }
        return counter > 0 ? results.toString() : "Missing";
    }

    /**
     * Downloads Index Exchange's sellers.json and collects every seller id.
     *
     * <p>Parses lines of the form {@code "seller_id": "189314",} by stripping the
     * key, quotes, and trailing comma.
     *
     * @return list of seller id strings; empty if the download or parse fails
     */
    private static ArrayList<String> readsellersjson() {
        ArrayList<String> idsfinal = new ArrayList<>();
        try {
            URL oracle = new URL("https://cdn.indexexchange.com/sellers.json");
            try (BufferedReader in = new BufferedReader(new InputStreamReader(oracle.openStream()))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    if (!inputLine.toLowerCase().contains("seller_id")) {
                        continue;
                    }
                    String[] data = inputLine.split(":", 2);
                    if (data.length < 2) {
                        continue;
                    }
                    // ` "189314",` -> `189314`
                    String id = data[1].replace(",", "").replace("\"", "").trim();
                    idsfinal.add(id);
                }
            }
        } catch (Exception e) {
            System.err.println("Failed reading sellers.json: " + e);
        }
        return idsfinal;
    }
}