-
Notifications
You must be signed in to change notification settings - Fork 9
/
cleaner.py
969 lines (901 loc) · 42 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
#!/usr/bin/env python
'''
Facebook Profile Cleaner
Copyright (c) 2015, Chander Ganesan <[email protected]>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Chander Ganesan nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL CHANDER GANESAN BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
from optparse import OptionParser
from textwrap import dedent
import facebook
import getpass
import dateutil.parser as dparser
import datetime
import tzlocal
import requests
import sys
import pytz
import pprint
import time
import traceback
from threading import Timer
from collections import defaultdict
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
import selenium.webdriver.support.ui as ui
from bs4 import BeautifulSoup
import logging
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)

# Send every log record that reaches the root logger to standard output.
handler = logging.StreamHandler(sys.stdout)
rootHandler = logging.getLogger()
rootHandler.addHandler(handler)
class FacebookCleaner(object):
    '''
    Remove, untag or unlike items on a single Facebook account.

    Items are located either through the Graph API (the ``graph`` property)
    or by walking pages in a Selenium-driven Firefox window (the ``driver``
    property); the actual removal is always done by clicking through the
    web UI.  Both the browser and the API client are created lazily on
    first use.
    '''

    def __init__(self, username, password):
        '''
        Store the credentials and reset the bookkeeping counters.

        username/password are typed into the Facebook login form the first
        time ``driver`` is accessed.
        '''
        self.login = False
        self.username = username
        self.password = password
        self.printer = pprint.PrettyPrinter(indent=4)
        # Consecutive "Page Not Found" responses, and how many times that
        # counter has overflowed (see load_page).
        self.nfcount = 0
        self.nfcount_cycles = 0
        # Number of completed perform_xpaths() runs (drives the '*' progress
        # marker).
        self.deleted = 0
        # Seconds to wait after each xpath step in perform_xpaths().
        self.delay = 1

    @property
    def graph(self):
        '''
        Intialize the graph stuff on the first attempt, or if the token is more
        than 3300 seconds old (since I think they expire after ~ 1 hour, and we need
        likely no more than 5 minutes to query the API in a single request set.)

        Also caches the profile of the logged-in user in self.profile,
        self.id and self.name.  Exits the process with status 1 when the
        profile cannot be fetched with the token.
        '''
        if (not getattr(self, '_graph', None)) or self.token_expires < time.time():
            token = self.get_api_token()
            self.token_expires = time.time() + 3300
            self._graph = facebook.GraphAPI(access_token=token)
            try:
                self.profile = self._graph.get_object('me')
                self.id = self.profile['id']
                self.name = self.profile['name']
            except facebook.GraphAPIError as e:
                logger.error(
                    "Failure to access Graph API with token - error: %s", e)
                logger.error(
                    "Perhaps you need to get a new one here: https://developers.facebook.com/tools/explorer/")
                sys.exit(1)
        return self._graph

    @property
    def driver(self):
        '''
        Load the browser and driver when it's first requested/used, rather than
        when the object is initialized.

        Logs in to Facebook on first use.  A failed attempt is retried; after
        more than 5 failures the process exits with status 2.
        '''
        attempts = 0
        while not self.login:
            try:
                if not getattr(self, '_driver', None):
                    self._driver = webdriver.Firefox()
                    self._driver.set_window_size(800, 600)
                    # or you can use
                    # Chrome(executable_path="/usr/bin/chromedriver")
                    self._driver.set_page_load_timeout(10)
                self._driver.get("https://www.facebook.com")
                # Explicit check instead of `assert`, which is stripped
                # when Python runs with -O.
                if "Facebook" not in self._driver.title:
                    raise RuntimeError("Unexpected page title: %r"
                                       % (self._driver.title,))
                elem = self._driver.find_element_by_id("email")
                elem.send_keys(self.username)
                elem = self._driver.find_element_by_id("pass")
                elem.send_keys(self.password)
                elem.send_keys(Keys.RETURN)
                self.login = True
                time.sleep(5)
            except Exception:
                # `Exception` rather than a bare except so Ctrl-C still
                # aborts the login loop.
                attempts += 1
                if attempts > 5:
                    logger.error(
                        'Login failed - perhaps facebook is slow?!\n')
                    sys.exit(2)
        return self._driver

    def graphLookup(self, *args, **kwargs):
        '''
        Call GraphAPI.get_connections with the given arguments and return
        its result.  Exits the process with status 1 on a GraphAPIError.
        '''
        try:
            return self.graph.get_connections(*args, **kwargs)
        except facebook.GraphAPIError as e:
            logger.error("Failure to access Graph API: %s", e)
            logger.error(
                "This might be because your deletes took too long - get a new one and restart this tool?")
            logger.error(
                "Perhaps you need to get a new one here: https://developers.facebook.com/tools/explorer/")
            sys.exit(1)

    def __del__(self):
        '''Close the browser window if one was ever opened.'''
        if hasattr(self, '_driver'):
            self._driver.close()

    def is_visible(self, elem, timeout=2):
        '''
        Pause for half a second and report the element as visible.

        NOTE: the original code returned True before ever reaching its
        WebDriverWait(visibility_of) check, so that check never ran.  The
        unconditional behaviour is kept here (callers rely on it); `elem`
        and `timeout` are accepted but not used.
        '''
        time.sleep(.5)
        return True

    def _scroll_to_bottom(self):
        '''
        Scroll to the bottom of the current page repeatedly until the
        document height stops growing (i.e. no more content is lazy-loaded).
        '''
        height_js = 'return document.body.scrollHeight;'
        # Distinct sentinels so the loop body runs at least once.
        current_height = False
        height = True
        while height != current_height:
            current_height = self.driver.execute_script(height_js)
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Poll (30 x 0.5s) for the page to grow after the scroll.
            for _ in range(30):
                height = self.driver.execute_script(height_js)
                if height != current_height:
                    break
                time.sleep(.5)
            time.sleep(2)
            height = self.driver.execute_script(height_js)

    def navigateHomePage(self):
        '''
        Navigate to the user's profile page from the main page (by clicking
        the "Profile" link) and keep scrolling down until the entire page
        has been loaded.

        Returns the perform_xpaths() result for the navigation (False when
        the Profile link was not found).
        '''
        logger.info(
            "Loading the entire profile page in the browser - this can take awhile!")
        url = 'https://www.facebook.com/'
        xpaths = [("//a[@title='Profile']", True,), ]
        result = self.perform_xpaths(url, xpaths)
        # Keep scrolling to the bottom until there's nothing left...
        self._scroll_to_bottom()
        return result

    def navigateActivityLog(self):
        '''
        A simple function to navigate to the activity log from the main page.
        This goes to the activity log page, and then keeps scrolling down until
        the entire activity log has been loaded.

        Returns the perform_xpaths() result for the navigation (False when a
        required link was not found).
        '''
        logger.info(
            "Loading the entire activity log in the browser - this can take awhile!")
        url = 'https://www.facebook.com/'
        xpaths = [("//*[contains(text(), 'Account Settings')]", True,),
                  ("//div[contains(text(), 'Activity Log')]", True,), ]
        result = self.perform_xpaths(url, xpaths)
        # Keep scrolling to the bottom until there's nothing left...
        self._scroll_to_bottom()
        return result

    @staticmethod
    def perform_click(driver, elem):
        '''
        Move the mouse to `elem` and click it.  Failures are only logged (at
        debug level), never raised; errors mentioning 'HTMLSpanElement' are
        not even logged.
        '''
        hover = ActionChains(driver).move_to_element(elem).click()
        try:
            hover.perform()
        except Exception as e:
            if 'HTMLSpanElement' not in str(e):
                logger.debug("Failed click: %s", e)

    @staticmethod
    def perform_hover(driver, elem):
        '''
        Move the mouse over `elem` without clicking.  Failures are only
        logged (at debug level), never raised; errors mentioning
        'HTMLSpanElement' are not even logged.
        '''
        hover = ActionChains(driver).move_to_element(elem)
        try:
            hover.perform()
        except Exception as e:
            if 'HTMLSpanElement' not in str(e):
                logger.debug("Failed hover: %s", e)

    def perform_xpaths(self, url, xpaths, additional_actions=None):
        '''
        Perform a set of xpath queries, in this case the
        value returned is either a boolean (False) indicating
        that the process failed for some reason, or a list of values, with
        the list normally containing nothing useful.

        url: page to load first; skipped when falsy.
        xpaths: iterable of (xpath, required) or (xpath, required, action)
            tuples.  The first match of each xpath gets `action` applied
            ('click' when omitted).  A missing element aborts the run
            (returning False) only when `required` is true.

        Default actions are: click (click on something) and hover (hover on something)
        if additional_actions is passed in (a dictionary) the default actions
        get augmented by the new ones.

        Raises Exception for a tuple that is not of length 2 or 3.
        '''
        results = []
        actions = {'click': self.perform_click,
                   'hover': self.perform_hover}
        if isinstance(additional_actions, (dict,)):
            actions.update(additional_actions)
        if url:
            self.load_page(url)
        for xpath_components in xpaths:
            if len(xpath_components) == 2:
                xpath, required = xpath_components
                action = 'click'
            elif len(xpath_components) == 3:
                xpath, required, action = xpath_components
            else:
                raise Exception(
                    'Invalid arguments to perform_xpaths {0}'.format(xpath_components))
            # Transform lower-case into translate function as it is not included with
            # xpath 1.0 (It's useful for performing case-insensitive matching.)
            xpath = re.sub(r"lower-case\((.+?)\),",
                           r"translate(\1, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),",
                           xpath)
            elem = self.driver.find_elements_by_xpath(xpath)
            if elem:
                elem = elem[0]
                if self.is_visible(elem):
                    results.append(actions[action](self.driver, elem))
            elif required:
                logger.debug(
                    "Failed xpath lookup (%s) for URL %s (aborting)", xpath, url)
                return False
            # Give the UI time to react before the next step.
            time.sleep(self.delay)
        self.deleted += 1
        if (self.deleted % 10) == 0:
            sys.stdout.write('*')
            sys.stdout.flush()
        return results

    def delete_status(self, url):
        '''
        A simple function to use the Firefox UI to remove a status entry.
        '''
        xpaths = [("//*[@aria-label='Story options']", True,),
                  ("//*[contains(text(), 'More options')]", False,),
                  ("//span[contains(text(), 'Delete')]", True,),
                  ("//button[contains(text(), 'Delete Post')]", True,), ]
        return self.perform_xpaths(url, xpaths)

    def delete_photo(self, url):
        '''
        A simple function to use the Firefox UI to remove a photo.
        '''
        xpaths = [("//*[contains(text(), 'Delete this photo')]", False,),
                  ("//button[contains(@class, 'Confirm')]", True,), ]
        return self.perform_xpaths(url, xpaths)

    def unlike_page(self, url):
        '''
        A simple function to use the Firefox UI to unlike a page that
        had been liked.  This has the side effect of unfollowing as well.
        '''
        xpaths = [("//button[contains(@class,'PageLikedButton')]", False, 'hover'),
                  ("//*[contains(text(), 'Unlike')]", True,), ]
        return self.perform_xpaths(url, xpaths)

    def delete_album(self, url):
        '''
        A simple function to use the Firefox UI to remove an album.
        '''
        xpaths = [("//a[contains(@class,'fbPhotoAlbumOptionsGear')]", True),
                  ("//*[contains(text(), 'Delete Album')]", False,),
                  ("//button[contains(@class, 'Confirm')]", True), ]
        return self.perform_xpaths(url, xpaths)

    def untag_photo(self, url):
        '''
        A simple function to use the Firefox UI to remove a tag from a
        photo.  Hover over the username and then click remove tag to remove
        the tag.

        Uses self.name, which is populated by the `graph` property.
        '''
        xpaths = [("//a[contains(@class,'taggee') and contains(text(), '{0}')]".format(self.name), True, 'hover'),
                  ("//a[contains(lower-case(text()), 'remove tag')]", True,),
                  ("//button[contains(@class, 'Confirm')]", False,), ]
        return self.perform_xpaths(url, xpaths)

    @staticmethod
    def _iter_pages(page):
        '''
        Yield every entry of a paged Graph API response, following the
        'paging'/'next' links (fetched with requests) until none is left.
        '''
        while True:
            for entry in page['data']:
                yield entry
            next_url = page.get('paging', {}).get('next')
            if not next_url:
                break
            page = requests.get(next_url).json()

    def album_generator(self):
        '''
        A generator that iterates over all of the user's albums ("me/albums").
        '''
        return self._iter_pages(self.graphLookup("me", "albums"))

    def clean_albums(self, max_date, min_date):
        '''
        Delete every album whose updated_time is before max_date and (when
        min_date is given) after min_date.
        '''
        deleted_albums = 0
        for album in self.album_generator():
            album["updated_time"] = dparser.parse(album["updated_time"])
            if (album['updated_time'] < max_date and
                    (not min_date or album['updated_time'] > min_date)):
                self.delete_album(album['link'])
                deleted_albums += 1
        logger.info(
            "There were %s album(s) with photos removed", deleted_albums)

    def photo_generator(self, max_date, min_date):
        '''
        A generator that iterates over all the photos ("me/photos").  The
        date arguments are accepted for symmetry with the other generators
        but are not used here; callers do their own filtering.
        '''
        # Note: we could use photos/uploaded to get just ours, but since this is
        # used by the clean tagged stuff also, we'll just use it all..
        return self._iter_pages(self.graphLookup("me", "photos"))

    def page_likes_generator(self, max_date, min_date):
        '''
        A generator that iterates over all the page likes ("me/likes") to
        return those pages that were liked.  The date arguments are not
        used here; callers do their own filtering.
        '''
        return self._iter_pages(self.graphLookup("me", "likes"))

    def purgeActivity(self, max_date, min_date):
        '''
        Go through the activity log and remove things...

        Entries dated after max_date, or before min_date when given, are
        left alone.
        '''
        post_purge_items = ('tagged_in', 'tagged_at')
        # TODO: items collected here still need to be unliked/untagged; at
        # the moment they are only gathered, never processed.
        post_purge = []
        for item_date, items in self.getOrderedActivity():
            if (item_date > max_date or
                    (min_date and item_date < min_date)):
                continue
            for item_type, item in items:
                if item_type in post_purge_items:
                    logger.debug(
                        'Adding item of type %s (on %s) to post_purge list',
                        item_type, item_date)
                    post_purge.append(item)
                else:
                    logger.debug(
                        "Purging item type of %s, on %s", item_type, item_date)
                    self.purgeElement(item, post_purge_items)

    def purgeElement(self, item, post_purge=()):
        '''
        Locate an edit button of an item, click it, and then perform
        the appropriate purge action.  This is specifically geared towards
        the activity log...

        item: the Selenium element of one activity-log entry.
        post_purge: accepted for the caller's benefit; not used.

        Returns the perform_xpaths() result of the confirmation clicks, or
        None when no edit button or no matching menu entry was found.
        '''
        item_bs = BeautifulSoup(item.get_attribute('innerHTML'))
        # Locate all the id tags in the parents.
        parents = [i.get('id')
                   for i in item_bs.find_all(lambda tag: tag.has_attr('id'))]
        # Build an Xpath query to locate any item with an ownerid that's in the
        # parent..
        xpath_string = ' | '.join("//*[contains(@data-ownerid, '{0}')]".format(parent_id)
                                  for parent_id in parents)
        try:
            edit = item.find_elements_by_xpath(
                ".//a[contains(@role,'button')]")[-1]
        except Exception:
            # No edit button (or the element went stale) - nothing to do.
            return
        self.perform_click(self.driver, edit)
        time.sleep(1)
        # An empty xpath is invalid, so skip the lookup when no ids were found.
        elements = (self.driver.find_elements_by_xpath(xpath_string)
                    if xpath_string else [])
        # We need to delete things from the most effective to least effective
        # so we start with delete, if that's not there then unlike, and if
        # that's not there then hide from timeline. In each case we loop through
        # all the elements to look for the one we need, if we find it that's where we
        # stop.
        xpath_pairs = [('delete', ".//span[contains(text(), 'Delete')]",),
                       ('unlike', ".//span[contains(text(), 'Unlike')]",),
                       ('unvote',
                        ".//span[contains(text(), 'Unvote')]",),
                       # ('hidden from timeline',
                       # ".//span[contains(text(), 'Hidden from Timeline')]",),
                       ]
        delete_xpaths = [("//span[contains(lower-case(text()), 'delete')]",
                          False,),
                         ("//button[contains(lower-case(text()), 'delete post')]",
                          False,),
                         ("//button[contains(lower-case(text()), 'confirm')]",
                          False,),
                         ]
        for keyword, xpathq in xpath_pairs:
            for elem in elements:
                elem2 = None
                bs = BeautifulSoup(elem.get_attribute('innerHTML'))
                if keyword in bs.text.lower():
                    elem2 = elem.find_elements_by_xpath(xpathq)
                if elem2:
                    time.sleep(1)
                    self.perform_click(self.driver, elem2[0])
                    return self.perform_xpaths(None, delete_xpaths)

    def cleanWall(self, max_date, min_date):
        '''
        Navigate to load the whole home page, then go and clean it up

        Stories dated after max_date, or before min_date when given, are
        left alone.  A failure on one story is logged (in debug mode) and
        does not stop the run.
        '''
        self.navigateHomePage()
        # Find the parent div for stories
        xpath_q = "//div[@data-time]"
        child_q = ".//a[@aria-label='Story options']"
        final_xpaths = [".//li[@role='presentation']//*[contains(text(), 'Remove tag')]",
                        ".//li[@role='presentation']//*[contains(text(), 'Remove Tag')]",
                        ".//li[@role='presentation']//*[contains(text(), 'Delete')]",
                        ".//li[@role='presentation']//*[contains(text(), 'Hide from timeline')]",
                        ".//li[@role='presentation']//*[contains(text(), 'Hide from Timeline')]",
                        ]
        delete_xpaths = [("//span[contains(lower-case(text()), 'delete')]",
                          False,),
                         ("//button[contains(lower-case(text()), 'delete post')]",
                          False,),
                         ("//button[contains(lower-case(text()), 'confirm')]",
                          False,),
                         ("//button[contains(lower-case(text()), 'okay')]",
                          False,),
                         ]
        for item in self.driver.find_elements_by_xpath(xpath_q):
            try:
                item_date = datetime.datetime.fromtimestamp(
                    int(item.get_attribute('data-time')), pytz.utc)
                if (item_date > max_date or
                        (min_date and item_date < min_date)):
                    continue
                edit = item.find_elements_by_xpath(child_q)
                if not edit:
                    continue
                self.perform_click(self.driver, edit[0])
                time.sleep(1)
                elems = [e for xpathq in final_xpaths
                         for e in self.driver.find_elements_by_xpath(xpathq)
                         if e.is_displayed()]
                # Only the first displayed menu entry is used.
                if elems:
                    self.perform_click(self.driver, elems[0])
                    time.sleep(3)
                    self.perform_xpaths(None, delete_xpaths)
            except Exception:
                if logger.isEnabledFor(logging.DEBUG):
                    logger.exception('Failed attempt to delete story')

    @staticmethod
    def _classify_activity(text):
        '''
        Map the text of one activity-log entry to an activity type string.
        Returns None for entries that are deliberately ignored (status
        updates and added photos) and 'unknown' when nothing matches.  The
        order of the checks matters: the first match wins.
        '''
        if text.startswith('You commented on'):
            return 'comment'
        if text.startswith('You were mentioned'):
            return 'mentioned'
        if 'updated' in text and 'status' in text:
            logger.debug('Ignoring status update')
            return None
        if 'added a new photo' in text:
            logger.debug('Ignoring added photo')
            return None
        if 'wrote on yourtimeline' in text:
            return 'wrote_on_me'
        if 'wrote on' in text:
            return 'wrote_on'
        if 'shared alink' in text or 'shared a link' in text:
            return 'shared_link'
        if 'likes alink' in text or 'likes a link' in text:
            return 'liked_link'
        if 'tagged in' in text:
            return 'tagged_in'
        if 'tagged at' in text:
            return 'tagged_at'
        if 'became friends' in text:
            return 'friend'
        if 'Happy Birthday' in text:
            return 'birthday'
        if 'friend request' in text:
            return 'friend_request'
        if 'worked on' in text:
            return 'worked_on'
        if text.startswith('You like'):
            return 'like'
        if 'posted in' in text:
            return 'posted_in'
        if 'replied to your' in text and 'comment' in text:
            return 'replied_commend'
        logger.debug('UNKNOWN: {0}'.format(text))
        return 'unknown'

    def getOrderedActivity(self):
        '''
        Load the whole activity log and yield (date, items) pairs, where
        `date` is a timezone-aware datetime and `items` a list of
        (activity_type, element) tuples for that day.

        The log rows are walked in reverse page order, so the entries of a
        day come before that day's date header, and the date headers of a
        month come before the "<Month> <Year>" header that finally supplies
        the year and triggers the yield.
        NOTE(review): days collected after the last "<Month> <Year>" header
        seen are never yielded - confirm whether that can happen in practice.
        '''
        self.navigateActivityLog()
        bborders = self.driver.find_elements_by_xpath(
            "//*[contains(@class,'bottomborder')] | //div[contains(@class, '_iqq')]")
        bborders.reverse()
        item_dates = defaultdict(list)
        id_list = []
        # Lower-cased month names of the current locale.
        months = tuple(datetime.date(2015, i, 1).strftime('%B').lower()
                       for i in range(1, 13))
        year_re = re.compile(r'({0})\s+(\d{{4}})'.format('|'.join(months)),
                             flags=re.IGNORECASE)
        # Get today and yesterday.
        now = datetime.datetime.now()
        today = now.date().strftime('%B %d')
        yesterday = (now - datetime.timedelta(days=1)).strftime('%B %d')
        for pos, item in enumerate(bborders):
            innerdata = item.get_attribute('innerHTML')
            soup = BeautifulSoup(innerdata)
            text = soup.text.encode('ascii', 'ignore').decode('ascii')
            year_match = year_re.match(text)
            # We found year, so save that set of stuff.
            if year_match:
                logger.debug(u"{0}: Got Year {1}".format(pos,
                                                          year_match.group(2)))
                if item_dates:
                    for mon_day, day_items in item_dates.items():
                        item_date = dparser.parse('{0}, {1}'.format(
                            mon_day, year_match.group(2))).replace(tzinfo=tzlocal.get_localzone())
                        # Now yield this set of stuff as a set of objects that includes
                        # the actual date of the post
                        logger.debug(
                            'Returning %s items for %s', len(day_items), item_date)
                        yield item_date, day_items
                else:
                    logger.debug("No item dates??!")
                item_dates = defaultdict(list)
                # A year entry is always followed by a date entry, so there
                # really should be no entries in id_list when we find a year..
                if len(id_list) != 0:
                    logger.debug("WTF! Id_list contains %s", id_list)
                    id_list = []
                continue
            # Relative day headers are turned into "<Month> <day>" first so
            # they are recognised by the month-name test below.  (The
            # original only checked for them inside that test, where they
            # could never match.)
            lowered = innerdata.lower()
            if lowered == 'today':
                innerdata = today
            elif lowered == 'yesterday':
                innerdata = yesterday
            if innerdata.lower().startswith(months):
                logger.debug("Got Date %s", innerdata)
                if id_list:
                    # innerdata is a month and day
                    item_dates[innerdata].extend(id_list)
                    id_list = []
                continue
            act_type = self._classify_activity(text)
            if act_type is None:
                continue
            id_list.append((act_type, item,))

    def clean_page_likes(self, max_date, min_date=None):
        '''
        Use the page_likes generator to get all the pages a user has liked,
        and then unlike them (based on the date range selected.)
        '''
        page_likes = []
        for page_like in self.page_likes_generator(max_date, min_date):
            page_like["created_time"] = dparser.parse(
                page_like["created_time"])
            if (page_like['created_time'] < max_date and
                    (not min_date or page_like['created_time'] > min_date)):
                page_likes.append(page_like)
                if (len(page_likes) % 10) == 0:
                    sys.stdout.write('L')
                    sys.stdout.flush()
        logger.info(
            "There are {0} page's to be unliked".format(len(page_likes)))
        for page_like in page_likes:
            url = 'https://facebook.com/{0}'.format(page_like['id'])
            self.unlike_page(url)

    def clean_tagged_photos(self, max_date, min_date=None):
        '''
        Use the photos generator to clean all photos that a user has been
        tagged in, including those that the user might own him/herself.

        Only photos created before max_date and (when min_date is given)
        after min_date are untagged, matching clean_photos().  The original
        parsed the creation time but never applied the range.
        '''
        tagged_photos = []
        for tagged_photo in self.photo_generator(max_date, min_date):
            tagged_photo["created_time"] = dparser.parse(
                tagged_photo["created_time"])
            created = tagged_photo["created_time"]
            if not (created < max_date and
                    (not min_date or created > min_date)):
                continue
            found = False
            if tagged_photo['from']['id'] != self.id:  # Someone else's photo
                found = True
            # Our photo where we are tagged in it..
            elif 'tags' in tagged_photo and 'data' in tagged_photo['tags']:
                # NOTE(review): assumes each tag entry carries a 'from'
                # dict - verify against the Graph API response.
                found = any(elem['from']['id'] == self.id
                            for elem in tagged_photo['tags']['data'])
            if found:
                tagged_photos.append(tagged_photo)
                if (len(tagged_photos) % 10) == 0:
                    sys.stdout.write('T')
                    sys.stdout.flush()
        logger.info(
            "There are {0} photos's to be untagged".format(len(tagged_photos)))
        for tagged_photo in tagged_photos:
            self.untag_photo(tagged_photo['link'])

    def clean_photos(self, max_date, min_date=None):
        '''
        Delete whole albums first (clean_albums), since deleting an album
        deletes all the photos inside it, then use the photo generator to
        find the user's own remaining photos in the date range and delete
        them one by one.
        '''
        pictures = []
        self.clean_albums(max_date, min_date)
        for picture in self.photo_generator(max_date, min_date):
            picture["created_time"] = dparser.parse(picture["created_time"])
            if (picture['created_time'] < max_date and
                    (not min_date or picture['created_time'] > min_date)):
                if picture['from']['id'] != self.id:
                    continue
                pictures.append(picture)
                if (len(pictures) % 10) == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
        logger.info(
            "There are {0} pictures to be deleted".format(len(pictures)))
        for picture in pictures:
            if 'link' in picture:
                self.delete_photo(picture['link'])

    def get_api_token(self):
        '''
        Drive the Graph API Explorer page in the browser to request an
        access token with the permissions this tool needs, and return the
        token string read from the page.

        self.delay is raised to 5 seconds for the duration and restored
        afterwards, even when a step raises.
        '''
        main_window_handle = self.driver.window_handles[0]
        delay = self.delay
        self.delay = 5
        try:
            url = 'https://developers.facebook.com/tools/explorer/'
            xpaths = [("//span[contains(text(), 'Get Token')]", True),
                      ("//span[contains(text(), 'Get Access Token')]", True),
                      ("//a[contains(text(), 'Clear')]", True,),
                      ("//input[@name='user_status']", False,),
                      ("//input[@name='user_relationship']", False,),
                      ("//input[@name='user_photos']", False,),
                      ("//input[@name='user_videos']", False,),
                      ("//input[@name='user_interests']", False,),
                      ("//input[@name='user_friends']", False,),
                      ("//input[@name='user_events']", False,),
                      ("//input[@name='user_likes']", False,),
                      ("//*[@data-group='extended']", True,),
                      ("//input[@name='read_stream']", False,),
                      ("//button[contains(text(), 'Get Access Token')]", True,)
                      ]
            self.perform_xpaths(url, xpaths)
            time.sleep(3)
            # A permission dialog may have opened in a second window;
            # confirm it there and come back to the main window.
            if len(self.driver.window_handles) > 1:
                for handle in self.driver.window_handles:
                    try:
                        self.driver.switch_to_window(handle)
                    except Exception:
                        continue
                    if 'Log in' in self.driver.title:
                        xpaths = [("//button[contains(text(), 'Okay')]", True,)]
                        self.perform_xpaths(None, xpaths)
                self.driver.switch_to_window(main_window_handle)
            elem = self.driver.find_elements_by_xpath(
                "//span[contains(text(), 'Access Token')]/following::input[1]")[0]
            return elem.get_attribute("value")
        finally:
            self.delay = delay

    def get_user_id(self):
        '''
        Get the user_id of the Facebook user, the pretty one the user selected
        if it exists, otherwise the numerical ID.

        Returns None when the welcome-box link could not be found.
        '''
        additional_actions = {
            'copy': lambda driver, elem: elem.get_attribute("href")}
        xpaths = [("//a[@class='fbxWelcomeBoxName']", True, 'copy')]
        user_id = self.perform_xpaths("https://www.facebook.com", xpaths,
                                      additional_actions)
        if user_id:
            # href looks like https://www.facebook.com/<user id>...
            return user_id[0].split("/")[3]
        return None

    def clean_posts(self, max_date, min_date=None):
        '''
        Iterate over the posts for an account and delete them if possible,
        note that many posts aren't deleteable for various reasons, so in those
        cases you'll just get a link to the post and an error message (which you'll
        need to deal with on your own.)
        '''
        feed = self.graphLookup("me", "feed")  # requires read_stream
        posts = []
        # Get all the posts via the graph API
        for post in self._iter_pages(feed):
            post["created_time"] = dparser.parse(post["created_time"])
            if not (post['created_time'] < max_date and
                    (not min_date or post['created_time'] > min_date)):
                continue
            if post['from']['id'] != self.id:
                continue
            if post['type'] not in ('status', 'link', 'photo', 'video',):
                continue
            if 'are now friends.' in post.get('story', ''):
                # This is a new friend added post.
                continue
            posts.append(post)
            if (len(posts) % 10) == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
        logger.info("Found {0} posts to be deleted".format(len(posts)))
        user_id = self.get_user_id()
        for post in posts:
            # `or` also covers an empty 'actions' list.
            action = (post.get('actions') or [{}])[0]
            if 'link' not in action:
                continue
            url = action['link']
            # Some users have "pretty" user IDs and Facebook seems to prefer their
            # use to numerical user IDs in post URLs
            if user_id:
                url = re.sub(r"/([0-9]+)/posts", "/%s/posts" % user_id, url)
            if post['type'] in ('link', 'status', 'photo', 'video'):
                self.delete_status(url)
            time.sleep(5)

    def load_page(self, url):
        '''
        Load `url` in the browser, retrying up to 5 times when the load
        raises (e.g. a page-load timeout).

        "Page Not Found" responses are counted separately in self.nfcount:
        the first nine are retried after 2 seconds, the tenth triggers a
        2 hour sleep, and after more than 10 such sleeps the process exits.
        '''
        count = 0
        while count < 5:
            try:
                self.driver.get(url)
                time.sleep(5)
                not_found = "Page Not Found" in self.driver.title
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C and the
                # sys.exit() below are not swallowed by the retry loop.
                time.sleep(3)
                count += 1
                continue
            if not not_found:
                break
            self.nfcount += 1
            if self.nfcount < 10:
                time.sleep(2)
                continue
            logger.info(
                "Too many failed requests, sleeping for 2 hours")
            time.sleep(60 * 60 * 2)
            self.nfcount_cycles += 1
            self.nfcount = 0
            if self.nfcount_cycles > 10:
                print("Exiting - too many failures")
                sys.exit(0)
        else:
            # All 5 attempts raised.
            logger.info("Failed to load {0}".format(url))
if __name__ == '__main__':
    # Command-line entry point: parse the options, confirm with the user,
    # then run each requested clean-up step in turn.
    description = '''
    A tool to (permanently?) remove items from a users facebook history.
    This script uses the Facebook Graph API to retrieve data for a user
    account, and then remove each item that falls within a provided time range.
    Originally, this was developed to allow a fast and easy way to purge
    Facebook history from your account - it's especially useful when you have
    past data that is visible to people that you don't want/need it to be
    visible to (such as all friends except those within a group.) Though
    it could be argued that it's also easy to modify this script to simply
    change the permissions on all past posts recursively.
    '''
    parser = OptionParser(description=dedent(description).strip())
    parser.add_option("-s", "--min-date", dest="min_date",
                      help="The earliest at which to start deleting items (start date)",
                      default=None)
    # "--max_date" is the original spelling; "--max-date" is accepted too so
    # the option matches "--min-date".
    parser.add_option("-e", "--max-date", "--max_date",
                      dest="max_date", default=None,
                      help="The date of the most recent item to delete (inclusive) (end_date)")
    parser.add_option("-u", "--username",
                      dest="username", default=None,
                      help="Your facebook username")
    parser.add_option("-p", "--password",
                      dest="password", default=None,
                      help="Your facebook password")
    parser.add_option("--photos",
                      action='store_true',
                      dest="clean_photos", default=False,
                      help="Remove Photos")
    parser.add_option("--untag-photos",
                      action='store_true',
                      dest="clean_tagged_photos", default=False,
                      help="Untag Photos that a user is tagged in")
    parser.add_option("--posts",
                      action='store_true',
                      dest="clean_posts", default=False,
                      help="Remove Posts")
    parser.add_option("--purge-activity",
                      action='store_true',
                      dest="purge_activity", default=False,
                      help="Purge (almost) everything, including others comments, tagged photos, unliking, etc. using the activity log (do this last!)")
    parser.add_option("--clean-wall",
                      action='store_true',
                      dest="clean_wall", default=False,
                      help="Delete stuff from the wall.")
    parser.add_option("--page-likes",
                      action='store_true',
                      dest="clean_page_likes", default=False,
                      help="Unlike any liked pages")
    parser.add_option("--debug",
                      action='store_true',
                      dest="debug", default=False,
                      help="Enable debug output.")
    (options, args) = parser.parse_args()
    required_arguments = ['max_date', 'username', ]
    if options.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    # Collect ALL missing required options (the original reset the list on
    # every iteration, so only the last option was actually checked).
    missing_args = [arg for arg in required_arguments
                    if getattr(options, arg, None) is None]
    if missing_args:
        print("Missing argument(s) for {0}".format(', '.join(missing_args)))
        parser.print_help()
        sys.exit(0)
    if not any([options.clean_posts, options.clean_photos, options.clean_page_likes,
                options.clean_tagged_photos, options.purge_activity, options.clean_wall]):
        print("Must specify at least one action (--photos, --posts, " +
              "--untag-photos, --page-likes, --clean-wall, --purge-activity)!")
        parser.print_help()
        sys.exit(0)
    # Prompt (without echo) until a password has been entered.
    while not options.password:
        options.password = getpass.getpass(
            'Enter password for {0}: '.format(options.username))
    # Turn the date strings into datetimes in the local time zone.
    for f in ['max_date', 'min_date']:
        if getattr(options, f):
            setattr(options, f, dparser.parse(getattr(options, f)).replace(
                tzinfo=tzlocal.get_localzone()))
    fbc = FacebookCleaner(username=options.username, password=options.password)
    print(dedent('''
    Sometimes the browser page fails to load, and things get stuck!
    To fix this, there are a couple things you can do:
    1. If the FF window opens but no webpage is loaded you might need to upgrade
    Selenium. Update the version in requirements.txt and re-install the depdendencies.
    2. After the FF window opens, make it narrower - so
    that the ads and messenger are not visible in your browser window.
    3. If you notice it being "stuck" (i.e. the page is loading for a long time) press the
    browser's "stop" button, then wait a few seconds and things should continue
    normally.
    Sorry, but unfortunately Selenium doesn't have a component that lets the script hit "stop",
    so it's a manual thing.
    Note: If you close the browser window, things will likely stop working.
    Leave it open and watch the magic!
    DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER
    YOUR FACEBOOK DATA COULD BE DELETED - PRESS CONTROL-C TO ABORT THIS
    PROCESS NOW, IF YOU DON'T WANT THAT TO HAPPEN!!!!
    DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER DANGER
    '''))
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    answer = read_line(
        'This tool could remove portions of, or all of, your facebook account - are you sure you wish to continue (yes/N)? ')
    if answer.lower().strip() != 'yes':
        print("Please enter 'yes' to run this!")
        sys.exit(3)
    if options.clean_posts:
        fbc.clean_posts(max_date=options.max_date,
                        min_date=options.min_date)
    if options.clean_photos:
        fbc.clean_photos(max_date=options.max_date,
                         min_date=options.min_date)
    if options.clean_tagged_photos:
        fbc.clean_tagged_photos(max_date=options.max_date,
                                min_date=options.min_date)
    if options.clean_page_likes:
        fbc.clean_page_likes(max_date=options.max_date,
                             min_date=options.min_date)
    if options.purge_activity:
        fbc.purgeActivity(max_date=options.max_date, min_date=options.min_date)
    if options.clean_wall:
        fbc.cleanWall(max_date=options.max_date, min_date=options.min_date)