Merge branch 'develop'
afeena committed Jan 30, 2018
2 parents 38789a8 + db7c35b commit 8cc9cb1
Showing 4 changed files with 259 additions and 152 deletions.
174 changes: 114 additions & 60 deletions clone.py
@@ -14,23 +14,27 @@
GNU General Public License for more details.
"""

import argparse
import asyncio
import hashlib
import json
import os
import re
import sys

from asyncio import Queue

import aiohttp
import cssutils
import yarl
from bs4 import BeautifulSoup


class Cloner(object):
    def __init__(self, root, max_depth):
        self.visited_urls = []
        self.root = self.add_scheme(root)
        self.max_depth = max_depth
        self.moved_root = None
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
@@ -39,6 +43,7 @@ def __init__(self, root):
            os.mkdir(self.target_path)

        self.new_urls = Queue()
        self.meta = {}

    @staticmethod
    def add_scheme(url):
@@ -48,103 +53,147 @@ def add_scheme(url):
            new_url = yarl.URL('http://' + url)
        return new_url

    async def process_link(self, url, level, check_host=False):
        try:
            url = yarl.URL(url)
        except UnicodeError:
            return None
        # leave data:, javascript: and file: URIs untouched
        if url.scheme in ("data", "javascript", "file"):
            return url.human_repr()
        if not url.is_absolute():
            if self.moved_root is None:
                url = self.root.join(url)
            else:
                url = self.moved_root.join(url)

        host = url.host

        if check_host:
            if (host != self.root.host and self.moved_root is None) or \
                    url.fragment or \
                    (self.moved_root is not None and host != self.moved_root.host):
                return None

        if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
            await self.new_urls.put((url, level + 1))

        res = None
        try:
            res = url.relative().human_repr()
        except ValueError:
            print(url)
        return res

    async def replace_links(self, data, level):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
            res = await self.process_link(link['href'], level, check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
            res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            if redir['value'] != "":
                redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

    def _make_filename(self, url):
        host = url.host
        if url.is_absolute():
            file_name = url.relative().human_repr()
        else:
            file_name = url.human_repr()
        if not file_name.startswith('/'):
            file_name = "/" + file_name

        if file_name == '/' or file_name == "":
            if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
                file_name = '/index.html'
            else:
                file_name = host
        m = hashlib.md5()
        m.update(file_name.encode('utf-8'))
        hash_name = m.hexdigest()
        return file_name, hash_name

    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}

            data = None
            content_type = None
            try:
                response = await session.get(current_url)
                content_type = response.content_type
                data = await response.read()
            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                print(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))

    async def get_root_host(self):
        try:
            with aiohttp.ClientSession() as session:
                resp = await session.get(self.root)
                if resp._url_obj.host != self.root.host:
                    self.moved_root = resp._url_obj
                resp.close()
        except aiohttp.ClientError:
            print("Can't connect to target host.")
            exit(-1)

    async def run(self):
        session = aiohttp.ClientSession()
        try:
            await self.new_urls.put((self.root, 0))
            await self.get_body(session)
        except KeyboardInterrupt:
            raise
        finally:
            with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
                json.dump(self.meta, mj)
            await session.close()


def main():
@@ -158,9 +207,14 @@ def main():
    loop = asyncio.get_event_loop()
    parser = argparse.ArgumentParser()
    parser.add_argument("--target", help="domain of the page to be cloned", required=True)
    parser.add_argument("--max-depth", help="max depth of the cloning", required=False, default=sys.maxsize)
    args = parser.parse_args()
    try:
        cloner = Cloner(args.target, int(args.max_depth))
        loop.run_until_complete(cloner.get_root_host())
        loop.run_until_complete(cloner.run())
    except KeyboardInterrupt:
        pass


if __name__ == '__main__':
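
For reference, a minimal sketch of driving the cloner programmatically, mirroring main() above; the target domain and depth are placeholder values, and output lands under /opt/snare/pages/<host> as in the constructor. The CLI equivalent is python clone.py --target example.com --max-depth 1.

import asyncio

from clone import Cloner  # assumes the file above is saved as clone.py

loop = asyncio.get_event_loop()
cloner = Cloner('example.com', max_depth=1)      # placeholder target and depth
loop.run_until_complete(cloner.get_root_host())  # detect a redirect to another host (sets moved_root)
loop.run_until_complete(cloner.run())            # crawl, save pages under md5 names, write meta.json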
31 changes: 31 additions & 0 deletions converter.py
@@ -0,0 +1,31 @@
import os
import hashlib
from os import walk
import mimetypes
import json
import shutil


class Converter:
    def __init__(self):
        self.meta = {}

    def convert(self, path):
        files_to_convert = []

        for (dirpath, dirnames, filenames) in walk(path):
            for fn in filenames:
                files_to_convert.append(os.path.join(dirpath, fn))

        for fn in files_to_convert:
            path_len = len(path)
            file_name = fn[path_len:]
            m = hashlib.md5()
            m.update(fn.encode('utf-8'))
            hash_name = m.hexdigest()
            self.meta[file_name] = {'hash': hash_name, 'content_type': mimetypes.guess_type(file_name)[0]}
            shutil.copyfile(fn, os.path.join(path, hash_name))
            os.remove(fn)

        with open(os.path.join(path, 'meta.json'), 'w') as mj:
            json.dump(self.meta, mj)
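
For illustration, a rough sketch of how this converter might be invoked on a page directory cloned by an earlier version, before pages were stored under hashed names; the path below is a placeholder. Each file is copied to an md5-named file and removed, and meta.json maps the original name to that hash plus a guessed content type.

from converter import Converter  # assumes the file above is saved as converter.py

Converter().convert('/opt/snare/pages/example.com')  # placeholder page directory

# meta.json afterwards holds entries of roughly this shape:
# {"/index.html": {"hash": "<md5 of the file's original path>", "content_type": "text/html"}}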