1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#--------------------------------------------------------------------------------
#monitor_website_links.py v1.1, Copyright Bjoern Olausson
#--------------------------------------------------------------------------------
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#To view the license visit
#http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
#or write to
#Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
import time, random, mechanize, sys, smtplib, os, re
from datetime import timedelta, datetime
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email import Encoders
# ---------------------------------------------------------------------------
# User configuration
# ---------------------------------------------------------------------------
# URL to monitor
URL = "http://www.example.com/"
# Regular expression searched for in the text of every link on the page
# (case-insensitive)
SEARCH_FOR_REGEXP = re.compile(r"RFC", re.I)
# Whatever this regex matches is removed from each link URL, e.g. a session ID.
# NOTE: ".*" is greedy, so everything from the first "s=" up to and including
# the LAST "&" in the URL is removed.
REMOVE_FROM_URL = re.compile(r"s=.*&")
# Run the script for X days
MONITOR_FOR_DAYS = 30
# How long to sleep between checks (in seconds)
SLEEP_SECONDS_BETWEEN_CHECKS = 60*30
# Set to False if you do not want to store the results in a file
# Otherwise enter a path to a file e.g. "/home/USER/search_results.txt"
SAVE_TO = False
# Set to False if you do not want to mail the results
# Otherwise enter your mail address e.g. "user@example.com"
MAIL_TO = False
# GMAIL username and password
gmail_user = ""
gmail_pwd = ""
def mail(text, attach="false"):
    '''Send *text* to MAIL_TO through Gmail's SMTP server.

    Based on
    http://kutuma.blogspot.com/2007/08/sending-emails-via-gmail-with-python.html

    text   -- body of the message.  Exception objects are accepted and are
              converted with str() before being put into the message.
    attach -- path of a file to attach.  The default "false" (kept for
              backward compatibility) as well as None, False and "" mean
              "no attachment".

    Authenticates with the module-level gmail_user / gmail_pwd and sends
    from gmail_user to MAIL_TO.  Errors raised while reading the attachment
    or by smtplib propagate to the caller.
    '''
    # The monitoring loop hands over caught exceptions directly; MIMEText
    # needs a string payload to be able to serialize the message.
    if isinstance(text, BaseException):
        text = str(text)

    msg = MIMEMultipart()
    msg['From'] = gmail_user
    msg['To'] = MAIL_TO
    msg['Subject'] = "Found the following for your search"
    msg.attach(MIMEText(text))

    if attach and attach != "false":
        # Read the attachment inside a with-block so the handle is closed
        # even if reading fails.
        with open(attach, 'rb') as attachment:
            payload = attachment.read()
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(payload)
        Encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                        'attachment; filename="%s"' % os.path.basename(attach))
        msg.attach(part)

    mailServer = smtplib.SMTP("smtp.gmail.com", 587)
    try:
        mailServer.ehlo()
        mailServer.starttls()
        # Second EHLO after STARTTLS, as in the original recipe.
        mailServer.ehlo()
        mailServer.login(gmail_user, gmail_pwd)
        mailServer.sendmail(gmail_user, MAIL_TO, msg.as_string())
    finally:
        # Always release the connection, also when login/sendmail raised.
        # Should be mailServer.quit(), but that crashes...
        mailServer.close()
# Results file: opened once, kept open for the whole run and closed in the
# ``finally`` clause at the bottom.  None means "do not save results".
f = open(SAVE_TO, 'w') if SAVE_TO else None

# URL up to (not including) its last "/"; link URLs found on the page are
# appended to this prefix when the report is built.
base_url = URL[:URL.rfind("/")]

start_time = datetime.now()
stop_time = start_time + timedelta(days=int(MONITOR_FOR_DAYS))

# Browser setup
br = mechanize.Browser()
br.set_handle_gzip(False)
br.set_handle_referer(True)
br.set_handle_redirect(True)
br.set_handle_equiv(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# User-agent strings; one is picked at random for every check.
UA_DICT = {
    "OPERA": "Opera/9.80 (X11; Linux x86_64; U; Opera Next; en) Presto/2.8.131 Version/11.50 Gentoo",
    "KONQUEROR": "Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)",
    "MICROB": "Mozilla/5.0 (X11; U; Linux armv7l; en-GB; rv:1.9.2a1pre) Gecko/20090514 Firefox/3.0 Tablet browser 0.9.7 RX-34",
}

# Last report that was saved/mailed; a check whose result equals it is not
# reported again.
MESSAGE = ""


def _notify(text):
    '''Mail *text* if MAIL_TO is configured; print instead of raising on failure.'''
    if not MAIL_TO:
        return
    try:
        mail(text)
    except Exception as err:
        # A failed notification must not terminate the long-running monitor.
        print(err)


def _report_error(err):
    '''Print *err* and forward it by mail when mailing is enabled.'''
    print(err)
    _notify(str(err))


try:
    while datetime.now() < stop_time:
        USER_AGENT_STRING = random.choice(list(UA_DICT.values()))
        br.addheaders = [('User-agent', USER_AGENT_STRING)]
        try:
            br.open(URL)
            br.response().read()
            LINKS_FOUND = br.links(text_regex=SEARCH_FOR_REGEXP)
            # Built inside the try because links() may yield lazily, in which
            # case errors only surface while iterating.
            LINKTEXT = ["\n".join([LINK.text, base_url+"/"+REMOVE_FROM_URL.sub("", LINK.url)]) for LINK in LINKS_FOUND]
        except Exception as e:
            # One report per failed check; the loop then just sleeps and
            # retries on the next round.
            _report_error(e)
        else:
            RESULT = "\n".join(LINKTEXT)
            # Report only non-empty results that differ from the last report.
            if LINKTEXT and MESSAGE != RESULT:
                MESSAGE = RESULT
                stamped = str(datetime.now())+"\n"+MESSAGE
                if f is not None:
                    f.write(stamped+"\n\n")
                    f.flush()
                _notify(stamped)
        print("sleeping: %s" % SLEEP_SECONDS_BETWEEN_CHECKS)
        time.sleep(float(SLEEP_SECONDS_BETWEEN_CHECKS))
finally:
    if f is not None:
        f.close()
|