# EmailHarvester.py

#!/usr/bin/env python3
# encoding: UTF-8

"""
    This file is part of EmailHarvester
    Copyright (C) 2016 @maldevel
    https://github.com/maldevel/EmailHarvester
    
    EmailHarvester - A tool to retrieve Domain email addresses from Search Engines.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    
    For more see the file 'LICENSE' for copying permission.
"""

__author__ = "maldevel"
__copyright__ = "Copyright (c) 2016 @maldevel"
__credits__ = ["maldevel", "PaulSec", "cclauss", "Christian Martorella"]
__license__ = "GPLv3"
__version__ = "1.3.2"
__maintainer__ = "maldevel"

################################

import argparse
import sys
import time
import requests
import re
import os
import validators

from termcolor import colored
from argparse import RawTextHelpFormatter
from sys import platform as _platform
from urllib.parse import urlparse

################################


# Windows only: import colorama and call colorama.init() at module load,
# before any coloured output is printed (presumably so the colour codes
# produced through termcolor render in the Windows console -- confirm
# against the colorama documentation).
if _platform == 'win32':
    import colorama
    colorama.init()

class myparser:
    """Extract e-mail addresses of one domain from raw search-result text.

    Usage: call extract() to load the text and the domain, then emails()
    to obtain the de-duplicated addresses found in that text.
    """

    # Markup fragments that are deleted outright before matching.
    _STRIP_TOKENS = (
        '<KW>', '</KW>', '</a>', '<b>', '</b>', '</div>', '<em>', '</em>',
        '<p>', '</span>', '<strong>', '</strong>', '<title>', '<wbr>',
        '</wbr>',
    )

    # URL-encoded and literal separators that are replaced by one space so
    # they cannot glue unrelated text onto an address.
    _SEPARATOR_TOKENS = (
        '%2f', '%3a', '%3A', '%3C', '%3D', '&', '/', ':', ';', '<', '=', '>',
        '\\',
    )

    def __init__(self):
        self.temp = []      # raw regex matches, duplicates included
        self.new = []       # result of the last unique() call
        self.results = ""   # text to search, set by extract()
        self.word = ""      # domain the addresses must end with

    def extract(self, results, word):
        """Store the text to scan (``results``) and the target domain (``word``)."""
        self.results = results
        self.word = word

    def genericClean(self):
        """Strip known markup and turn separator tokens into spaces, in place."""
        for token in self._STRIP_TOKENS:
            self.results = self.results.replace(token, '')
        for token in self._SEPARATOR_TOKENS:
            self.results = self.results.replace(token, ' ')

    def emails(self):
        """Return the unique addresses ending in the stored domain.

        The stored text is cleaned first (see genericClean), so calling this
        method modifies ``self.results``.
        """
        self.genericClean()
        # Raw strings keep ``\-`` a regex escape rather than an invalid
        # string escape.  re.escape() makes the domain match literally;
        # unescaped, the "." in "example.com" would match any character
        # (e.g. "user@examplexcom").
        reg_emails = re.compile(
            r"[a-zA-Z0-9.\-_+#~!$&',;=:]+"
            r"@"
            r"[a-zA-Z0-9.-]*" +
            re.escape(self.word))
        self.temp = reg_emails.findall(self.results)
        return self.unique()

    def unique(self):
        """De-duplicate ``self.temp``, keeping first-seen order.

        dict.fromkeys is used instead of set() so the output order is the
        same on every run.
        """
        self.new = list(dict.fromkeys(self.temp))
        return self.new
    
###################################################################

class EmailHarvester(object):
    """Plugin host that downloads search-result pages and extracts e-mails.

    On construction every ``*.py`` module found in ``plugins/`` is imported
    and its ``Plugin`` class instantiated with this object and a small
    configuration dict.  Plugins are presumably expected to call
    register_plugin() to expose their search functions.
    """

    def __init__(self, userAgent, proxy):
        """Store the settings and load every plugin module.

        userAgent -- value sent as the User-Agent request header.
        proxy     -- falsy for a direct connection, otherwise an object with
                     ``scheme`` and ``netloc`` attributes (the script passes
                     a urlparse() result).
        """
        self.plugins = {}
        self.proxy = proxy
        self.userAgent = userAgent
        self.parser = myparser()
        self.activeEngine = "None"
        path = "plugins/"

        # Make the plugin modules importable by bare name.
        sys.path.insert(0, path)
        for f in os.listdir(path):
            fname, ext = os.path.splitext(f)
            if ext == '.py':
                mod = __import__(fname, fromlist=[''])
                # Instantiated for its side effects only; the instance is not
                # kept here.
                mod.Plugin(self, {'useragent': userAgent, 'proxy': proxy})

    def register_plugin(self, search_method, functions):
        """Record ``functions`` under the plugin name ``search_method``."""
        self.plugins[search_method] = functions

    def get_plugins(self):
        """Return the dict of registered plugins."""
        return self.plugins

    def show_message(self, msg):
        """Print ``msg`` in bold green."""
        print(green(msg))

    def init_search(self, url, word, limit, counterInit, counterStep, engineName):
        """Reset the search state before a new run of process().

        url         -- template containing ``{counter}`` and ``{word}`` fields.
        word        -- search term substituted into ``url``.
        limit       -- process() stops once the counter reaches this value.
        counterInit -- starting value of the counter.
        counterStep -- amount added to the counter after each request.
        engineName  -- label used in progress messages.
        """
        self.results = ""
        self.totalresults = ""
        self.limit = int(limit)
        self.counter = int(counterInit)
        self.url = url
        self.step = int(counterStep)
        self.word = word
        self.activeEngine = engineName

    def _build_proxies(self):
        """Return the ``proxies`` mapping for requests, or None without a proxy.

        requests keys this mapping by the scheme of the *target* URL, so both
        schemes are listed; keying it by the proxy's own scheme would let
        requests for the other scheme bypass the proxy silently.  The proxy
        itself is addressed as ``http://<netloc>``, as it always was.
        """
        if not self.proxy:
            return None
        proxy_url = "http://" + self.proxy.netloc
        return {'http': proxy_url, 'https': proxy_url}

    @staticmethod
    def _decode_response(content, encoding):
        """Decode a response body without raising on bad data.

        Falls back to UTF-8 when ``encoding`` is missing or unknown, and
        replaces undecodable bytes instead of raising UnicodeDecodeError.
        """
        try:
            return content.decode(encoding or 'UTF-8', errors='replace')
        except LookupError:
            return content.decode('UTF-8', errors='replace')

    def do_search(self):
        """Fetch one result page and append its text to ``totalresults``.

        Any exception raised while building or sending the request is
        printed and the program exits with status 4.
        """
        try:
            urly = self.url.format(counter=str(self.counter), word=self.word)
            headers = {'User-Agent': self.userAgent}
            r = requests.get(urly, headers=headers,
                             proxies=self._build_proxies())
        except Exception as e:
            print(e)
            sys.exit(4)

        self.results = self._decode_response(r.content, r.encoding)
        self.totalresults += self.results

    def process(self):
        """Request pages until the counter reaches the limit.

        Sleeps one second after every request and prints the counter value
        reached so far.
        """
        while self.counter < self.limit:
            self.do_search()
            time.sleep(1)
            self.counter += self.step
            print(green("[+] Searching in {}:".format(self.activeEngine)) + cyan(" {} results".format(str(self.counter))))

    def get_emails(self):
        """Return the e-mail addresses found in everything fetched so far."""
        self.parser.extract(self.totalresults, self.word)
        return self.parser.emails()
    
###################################################################

def yellow(text):
    """Return ``text`` wrapped by termcolor as bold yellow."""
    style = ['bold']
    return colored(text, color='yellow', attrs=style)

def green(text):
    """Return ``text`` wrapped by termcolor as bold green."""
    style = ['bold']
    return colored(text, color='green', attrs=style)

def red(text):
    """Return ``text`` wrapped by termcolor as bold red."""
    style = ['bold']
    return colored(text, color='red', attrs=style)

def cyan(text):
    """Return ``text`` wrapped by termcolor as bold cyan."""
    style = ['bold']
    return colored(text, color='cyan', attrs=style)

def unique(data):
    """Return the items of ``data`` without duplicates, in first-seen order.

    dict.fromkeys is used instead of set() so the result order is the same
    on every run; iteration order of a set of strings can change between
    runs.
    """
    return list(dict.fromkeys(data))

###################################################################

def checkProxyUrl(url):
    """argparse ``type`` callback: validate a proxy URL and return it parsed.

    Returns the urlparse() result for ``url``.
    Raises argparse.ArgumentTypeError when the URL cannot be parsed, its
    scheme is not http/https, or it has no network location.
    """
    try:
        url_checked = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed input itself (for example an
        # unbalanced IPv6 bracket); report it with the message used below.
        url_checked = None
    if (url_checked is None
            or url_checked.scheme not in ('http', 'https')
            or url_checked.netloc == ''):
        raise argparse.ArgumentTypeError('Invalid {} Proxy URL (example: http://127.0.0.1:8080).'.format(url))
    return url_checked

def limit_type(x):
    """argparse ``type`` callback: convert ``x`` to a positive integer.

    Returns int(x) when it is at least 1; otherwise raises
    argparse.ArgumentTypeError.  A value int() cannot convert raises
    whatever int() raises.
    """
    value = int(x)
    if value <= 0:
        raise argparse.ArgumentTypeError("Minimum results limit is 1.")
    return value

def checkDomain(value):
    """argparse ``type`` callback: accept ``value`` only if it is a valid domain.

    Returns ``value`` unchanged when validators.domain() yields a truthy
    result; otherwise raises argparse.ArgumentTypeError.
    """
    if validators.domain(value):
        return value
    raise argparse.ArgumentTypeError('Invalid {} domain.'.format(value))

###################################################################

if __name__ == '__main__':
    # Command-line entry point.
    # Exit codes set in this block: 1 after listing plugins, 2 when no domain
    # was given, 3 for an unknown engine, 4 when no e-mails were found.

    # Raw string: the ASCII-art banner contains many backslashes that are not
    # valid string escape sequences.
    parser = argparse.ArgumentParser(description=r"""

 _____                   _  _   _   _                                _              
|  ___|                 (_)| | | | | |                              | |             
| |__  _ __ ___    __ _  _ | | | |_| |  __ _  _ __ __   __ ___  ___ | |_  ___  _ __ 
|  __|| '_ ` _ \  / _` || || | |  _  | / _` || '__|\ \ / // _ \/ __|| __|/ _ \| '__|
| |___| | | | | || (_| || || | | | | || (_| || |    \ V /|  __/\__ \| |_|  __/| |   
\____/|_| |_| |_| \__,_||_||_| \_| |_/ \__,_||_|     \_/  \___||___/ \__|\___||_| 

    A tool to retrieve Domain email addresses from Search Engines | @maldevel
                                {}: {}
""".format(red('Version'), yellow(__version__)),
                                     formatter_class=RawTextHelpFormatter)

    parser.add_argument("-d", '--domain', action="store", metavar='DOMAIN', dest='domain',
                        default=None, type=checkDomain, help="Domain to search.")
    parser.add_argument("-s", '--save', action="store", metavar='FILE', dest='filename',
                        default=None, type=str, help="Save the results into a TXT and XML file (both).")

    parser.add_argument("-e", '--engine', action="store", metavar='ENGINE', dest='engine',
                        default="all", type=str, help="Select search engine plugin(eg. '-e google').")

    parser.add_argument("-l", '--limit', action="store", metavar='LIMIT', dest='limit',
                        type=limit_type, default=100, help="Limit the number of results.")
    parser.add_argument('-u', '--user-agent', action="store", metavar='USER-AGENT', dest='uagent',
                        type=str, help="Set the User-Agent request header.")
    parser.add_argument('-x', '--proxy', action="store", metavar='PROXY', dest='proxy',
                        default=None, type=checkProxyUrl, help="Setup proxy server (eg. '-x http://127.0.0.1:8080')")
    parser.add_argument('--noprint', action='store_true', default=False,
                        help='EmailHarvester will print discovered emails to terminal. It is possible to tell EmailHarvester not to print results to terminal with this option.')
    parser.add_argument('-r', '--exclude', action="store", metavar='EXCLUDED_PLUGINS', dest="exclude",
                        type=str, default=None, help="Plugins to exclude when you choose 'all' for search engine (eg. '-r google,twitter')")
    parser.add_argument('-p', '--list-plugins', action='store_true', dest='listplugins',
                        default=False, help='List all available plugins.')

    # No arguments at all: show the help text.  ``==`` rather than ``is``,
    # because identity comparison of integers is not a value comparison.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    args = parser.parse_args()

    # --list-plugins: print the module names found in plugins/ and stop.
    if args.listplugins:
        path = "plugins/"
        print(green("[+] Available plugins"))
        sys.path.insert(0, path)
        for f in os.listdir(path):
            fname, ext = os.path.splitext(f)
            if ext == '.py':
                print(green("[+] Plugin: ") + cyan(fname))
        sys.exit(1)

    if not args.domain:
        print(red("[-] Please specify a domain name to search."))
        sys.exit(2)
    domain = args.domain

    userAgent = (args.uagent or
                 "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1")

    print(green("[+] User-Agent in use: ") + cyan(userAgent))

    if args.proxy:
        print(green("[+] Proxy server in use: ") + cyan(args.proxy.scheme + "://" + args.proxy.netloc))

    filename = args.filename or ""
    limit = args.limit
    engine = args.engine
    app = EmailHarvester(userAgent, args.proxy)
    plugins = app.get_plugins()

    all_emails = []
    excluded = []
    if args.exclude:
        # Tolerate spaces around the commas ("-r 'google, twitter'").
        excluded = [name.strip() for name in args.exclude.split(',')]
    if engine == "all":
        print(green("[+] Searching everywhere"))
        for search_engine in plugins:
            if search_engine not in excluded:
                all_emails += plugins[search_engine]['search'](domain, limit)
    elif engine not in plugins:
        print(red("[-] Search engine plugin not found"))
        sys.exit(3)
    else:
        all_emails = plugins[engine]['search'](domain, limit)
    all_emails = unique(all_emails)

    if not all_emails:
        print(red("[-] No emails found"))
        sys.exit(4)

    print(green("[+] Emails found: ") + cyan(str(len(all_emails))))

    if not args.noprint:
        for email in all_emails:
            print(email)

    if filename:
        print(green("[+] Saving results to files"))

        # Plain-text output: one address per line.  A failure on a single
        # address is reported and the remaining addresses are still written.
        try:
            with open(filename, 'w') as out_file:
                for email in all_emails:
                    try:
                        out_file.write(email + "\n")
                    except Exception:
                        print(red("[-] Exception: " + email))
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError inside this handler.
            print(red("[-] Error saving TXT file: " + str(e)))

        # XML output goes next to the text file, with the extension swapped.
        # splitext() only removes the final extension, so names such as
        # "./out/results.txt" or "my.results.txt" keep their path and stem.
        try:
            xml_filename = os.path.splitext(filename)[0] + ".xml"
            with open(xml_filename, 'w', encoding='utf-8') as out_file:
                out_file.write('<?xml version="1.0" encoding="UTF-8"?><EmailHarvester>')
                for email in all_emails:
                    out_file.write('<email>{}</email>'.format(email))
                out_file.write('</EmailHarvester>')
            print(green("[+] Files saved"))
        except Exception as er:
            print(red("[-] Error saving XML file: " + str(er)))