
://[^/]+)", str(link)) if m: return m.groups()[0] return '' ''' recursive algorithm to scan website and it's plain HTML-contents ''' def __scan_page(self, link): # reject invalid URLs if not self.is_url(link): return # reject already processed links if link in self.__links or self.__unique_urls.count(link) = self.__max_http_get_urls): return # reject recursive repetitions if re.search("\..+/", urlparse.urlparse(link)[2]): return # try to open the URL try: url = urllib2.urlopen(link) except: return # reject broken links and links that can't be accessed without errors if not url or !

Ziw xxx-61

In some situations this is inevitable, for example when you choose to encrypt your entire existing operating system, including your root mountpoint. You might want to back up your plain/unencrypted hard disk, encrypt it, and then restore its contents to the encrypted mountpoint. It's not advisable to back up a running operating system unless you know exactly what you're doing; a consistent backup can be created in Single User Mode.
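On a typical sysvinit-based Linux system you can drop into Single User Mode from a root console with telinit; on a systemd-based distribution, systemctl isolate rescue.target is the rough equivalent. This is only a sketch, and the exact procedure differs between distributions:

# switch to Single User Mode before running the backup
telinit 1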

The backup script:

#!/bin/sh
# NOTE: the first lines of the published script are cut off; a minimal argument check is assumed here
dir=$1
[ -z "$dir" ] && echo "usage: $0 <backup directory>" >&2 && exit 1
[ -d ${dir}/ ] || { echo "$dir is not a directory"; exit 1; }
dir=${dir%/}

echo "Starting $(hostname -f)'s backup at $(date) ..."
time tar --exclude=/dev/* \
    --exclude=/lost+found/* \
    --exclude=/media/* \
    --exclude=/mnt/* \
    --exclude=/proc/* \
    --exclude=/var/cache/apt/archives/* \
    --exclude=/sys/* \
    --exclude=/tmp/* \
    --exclude=/usr/ports/* \
    --exclude=/${dir}/* \
    -cpf - / | gzip --best
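A minimal usage sketch, assuming the script is saved as backup.sh and an external disk is mounted at /mnt/backup (both names are placeholders; the published listing ends with gzip writing to stdout, so the redirection below is an assumption). Restoring into the freshly encrypted root filesystem, mounted here at /mnt/encrypted-root, is a plain tar extraction:

# create the backup from Single User Mode; /mnt/* is already excluded by the script,
# so the archive will not end up inside itself
sh backup.sh /mnt/backup > /mnt/backup/$(hostname -f).tar.gz

# later, restore into the encrypted filesystem once it is set up and mounted
cd /mnt/encrypted-root && gunzip -c /mnt/backup/$(hostname -f).tar.gz | tar -xpf -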

#!/usr/bin/python
''' programming a Web-Spider '''
import os
import re
import sys
import urllib2
import urlparse


''' class UniqueURLs: handles unique URLs '''
class UniqueURLs:
    __urls = {}      # URL hash: key = (protocol + domain + path, (GET-parameter names)), value = amount
    __max_kind = 0   # max. number of URLs of the same kind

    def __init__(self, max_kind):
        n = max(max_kind, 1)
        self.__max_kind = n

    ''' transforms a URL into a unique key '''
    def __transform(self, url):
        t = urlparse.urlparse(url)
        # get the GET-variable names from the URL (regex reconstructed; the original is garbled)
        q = re.findall("[?&;]([^ =;&]+)=", url)
        # create a unique hash from protocol, domain, path and GET-variable names
        return (str(t[0]) + "://" + str(t[1]) + str(t[2]), tuple(q))

    ''' add a URL '''
    def add(self, url):
        e = self.__transform(url)
        if not e in self.__urls:
            self.__urls[e] = 1
        else:
            if self.__urls[e] == self.__max_kind:
                return False
            self.__urls[e] += 1
        return True

    ''' count URLs of the same kind, or all URLs '''
    def count(self, url=''):
        if not url:
            return len(self.__urls)
        e = self.__transform(url)
        if not e in self.__urls:
            return 0
        return self.__urls[e]


''' class Website: recursively scans a website and saves HTTP GET-parameter URLs '''
class Website:
    __domain = ''              # website's domain name
    __links = set()            # accepted URLs
    __http_get_urls = set()    # HTTP GET-URLs
    __skipped = set()          # skipped URLs
    __max_urls = 40000         # max. URLs to evaluate
    __follow_links = 20        # max. number of similar links to follow (see class UniqueURLs)
    __max_url_length = 256     # max. allowed length of a URL
    __max_http_get_urls = 300  # max. amount of HTTP GET-URLs to collect

    ''' interface to class UniqueURLs '''
    __unique_urls = UniqueURLs(__follow_links)

    ''' constructor accessing the website's URLs '''
    def __init__(self, url):
        http = "http://"
        https = "https://"
        c = 0
        fn = ''
        # reject invalid URLs
        if not str(url).startswith(http) and not str(url).startswith(https):
            url = http + url
        if not self.is_url(url):
            sys.stderr.write("not a valid URL: " + url + "\n")
            return
        # get the domain name
        self.__domain = self.get_domain(url)
        print "domain:", self.__domain
        # check the URL
        print "evaluating HTTP-code of URL '" + url + "' ...",
        c = self.get_http_code(url)
        if c == 200:
            print c, "O.K."
        else:
            if c == None:
                sys.stderr.write("Wrong URL\n")
            else:
                sys.stderr.write("error code: " + str(c) + "\n")
            return
        # scan the URL
        self.__scan_page(url)
        # save the results to files
        fn = str(urlparse.urlparse(url)[1])
        self.__save_content(fn + ".urls", self.__links)
        self.__save_content(fn + ".skipped", self.__skipped)
        self.__save_content(fn + ".get", self.__http_get_urls)

    ''' returns the HTTP-code of a specific URL '''
    def get_http_code(self, link):
        if not self.is_url(str(link)):
            return None
        try:
            url = urllib2.urlopen(link)
        except:
            return None
        return url.getcode()

    ''' basic URL syntax check '''
    def is_url(self, link):
        return bool(re.search("^http[s]?://[^ ]+\.[^ ]+$", str(link)))

    ''' gets the domain name from a valid URL '''
    def get_domain(self, link):
        m = re.search("(^http[s]?://[^/]+)", str(link))
        if m:
            return m.groups()[0]
        return ''

    ''' recursive algorithm to scan a website and its plain HTML contents '''
    def __scan_page(self, link):
        # reject invalid URLs
        if not self.is_url(link):
            return
        # reject already processed links and stop once the collection limits are reached
        # (condition reconstructed; the original line is garbled)
        if (link in self.__links or
                self.__unique_urls.count(link) >= self.__follow_links or
                len(self.__http_get_urls) >= self.__max_http_get_urls):
            return
        # reject recursive repetitions in the path
        if re.search("\..+/", urlparse.urlparse(link)[2]):
            return
        # try to open the URL
        try:
            url = urllib2.urlopen(link)
        except:
            return
        # reject broken links and links that can't be accessed without errors
        if not url:
            return
        # (the rest of the listing is cut off in the source)
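Everything is driven by the Website constructor, so a tiny driver script is enough to try it out. This is only a sketch and assumes the listing above is saved as spider.py; the output file names come from the constructor (<domain>.urls, <domain>.skipped, <domain>.get):

#!/usr/bin/python
''' hypothetical driver for the Website class; assumes the listing is saved as spider.py '''
import sys
from spider import Website

if len(sys.argv) != 2:
    sys.stderr.write("usage: " + sys.argv[0] + " <url>\n")
    sys.exit(1)

# the constructor checks the URL, scans the site and saves the results
Website(sys.argv[1])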
