-
Notifications
You must be signed in to change notification settings - Fork 25
/
subredditarchive.py
152 lines (150 loc) · 6.4 KB
/
subredditarchive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# -*- coding: utf-8 -*-
import time
import datetime
import praw
import os
import traceback
import requests
print " __ __ _ _ _ _ "
print " the / / / /| | (_) | | (_) "
print " / / __ / /_| | ___ __ _ ___ ___ ___ _ _ __ __ _ _ __ ___| |__ ___ _____ _ __ "
print " / / '__/ / _` |/ \ / _` |/ _ \/ __/ _ \| | '_ \ / _` | '__/ __| '_ \| \ \ / / _ \ '__|"
print " / /| | / / (_| | (_) | (_| | __/ (_| (_) | | | | | | (_| | | | (__| | | | |\ V / __/ | "
print " /_/ |_|/_/ \__,_|\___/ \__, |\___|\___\___/|_|_| |_| \__,_|_| \___|_| |_|_| \_/ \___|_| project"
print " __/ | "
print " |___/ code-name: chugger "
print "Code written by: /u/peoplma and /u/healdb"
print "Sillines added by: /u/joshtheimpaler"
print "Wow by: /r/dogecoin\n\n"
b = "timestamp:"
d = ".."
#Config Details-
r = praw.Reddit('searchandarchive by ')
def resume():
    """Load previously saved archive parameters from config.txt.

    Returns:
        (startStamp, endStamp, step, subName) with the timestamps and step
        converted to int when config.txt exists; otherwise 0, so the
        caller's tuple unpacking raises and is treated as "nothing to
        continue".
    """
    if os.path.exists('config.txt'):
        # Context-managed open() instead of the Py2-only file() builtin,
        # which leaked the file handle.
        with open('config.txt') as cfg:
            line = cfg.read()
        # Written comma-separated by the "Start New Archive" menu branch,
        # in exactly this order.
        startStamp, endStamp, step, subName = line.split(',')
        return int(startStamp), int(endStamp), int(step), subName
    else:
        return 0
# --- Interactive menu ---------------------------------------------------
# NOTE(review): Python 2 input() eval()s what the user types, so entering
# "1" yields the int 1; raw_input() returns the raw string.
choice = input('\nMENU\nPlease choose one of the following:\n1. Start New Archive\n2. Continue Archiving\n3. Exit\n(Input the number)\n')
if(choice==1):
    # New archive: collect subreddit + date range, convert the dd/mm/yyyy
    # dates to Unix timestamps, and persist everything to config.txt so a
    # later run can resume via resume().
    subName=raw_input('Input the subreddit to archive: ')
    sdate=raw_input('Input start date in the format dd/mm/yyyy: ')
    startStamp= int(time.mktime(datetime.datetime.strptime(sdate, "%d/%m/%Y").timetuple()))
    edate=raw_input('Input end date in the format dd/mm/yyyy: ')
    endStamp= int(time.mktime(datetime.datetime.strptime(edate, "%d/%m/%Y").timetuple()))
    step=input('Input seconds between each search, 30 recommended: ')
    # file() is the Py2-only alias of open(); fields are written in the
    # exact comma-separated order resume() unpacks them.
    obj=file('config.txt','w')
    obj.write(str(startStamp)+','+str(endStamp)+','+str(step)+','+str(subName))
    obj.close()
elif(choice==2):
    try:
        # resume() returns 0 when config.txt is missing, so the 4-way
        # tuple unpack raises and lands in the except branch below.
        startStamp,endStamp,step,subName=resume()
    except:
        print('Nothing to continue.')
        exit()
else:
    exit()
# Human-readable dates, only used to build the output folder name.
sdate=datetime.datetime.fromtimestamp(int(startStamp)).strftime('%d-%m-%Y')
edate=datetime.datetime.fromtimestamp(int(endStamp)).strftime('%d-%m-%Y')
folderName=str(subName+' '+str(sdate)+' '+str(edate))
if not os.path.exists(folderName):
    os.makedirs(folderName)
def getNew(subName, folderName):
    """Fetch the newest ~1000 comments and submissions from subName and
    save each item's full .json listing into folderName.

    A submission is skipped when its output file already exists, which
    also makes the dedup persistent across calls. (The original code
    referenced the undefined names `submission` and `already_done` here
    and raised NameError on the first post.)
    """
    headers = {'user-agent': 'archive by /u/healdb'}
    subreddit_comments = r.get_comments(subName, limit=1000)
    subreddit_posts = r.get_submissions(subName, limit=1000)
    for comment in subreddit_comments:
        print(comment)
        url = "https://reddit.com" + comment.permalink
        # Manually grabbing this file is much faster than loading the
        # individual json of every single comment; it provides all of it.
        response = requests.get(url + '.json', headers=headers)
        filename = folderName + "/" + comment.name
        obj = open(filename, 'w')
        obj.write(response.text)
        obj.close()
    for post in subreddit_posts:
        print(post)
        url = "https://reddit.com" + post.permalink
        filename = folderName + "/" + post.name
        # Skip submissions already archived on disk.
        if os.path.exists(filename):
            continue
        response = requests.get(url + '.json', headers=headers)
        obj = open(filename, 'w')
        obj.write(response.text)
        obj.close()
def main(startStamp, endStamp, step, folderName, subName, progress):
    """Walk [startStamp, endStamp) in `step`-second windows, cloudsearch
    each window on subName, and save every hit's .json into folderName.

    Progress is checkpointed to folderName/lastTimestamp.txt after each
    completed window so an interrupted run resumes where it stopped.
    `progress` is the total span in seconds, used only to scale the ASCII
    progress bar.
    """
    count = step
    try:
        # Resume from a previous checkpoint if one exists; a missing or
        # unparseable file just means "start from startStamp".
        with open(folderName + "/lastTimestamp.txt") as fh:
            saved = fh.read()
        print("Resuming from timestamp: " + saved)
        time.sleep(3)
        startStamp = int(saved)
        progress = startStamp
    except (IOError, OSError, ValueError):
        pass
    c = 1
    for currentStamp in range(startStamp, endStamp, step):
        # Alternate a small suffix so the redraw visibly ticks.
        e = ' |' if c % 2 == 0 else ' --'
        f = str(currentStamp)
        g = str(currentStamp + step)
        # Cloudsearch range query: "timestamp:<f>..<g>".
        search_results = r.subreddit(subName).search(b + f + d + g, syntax='cloudsearch')
        filled = int(float(count) / float(progress) * 20.0)
        end = str((filled * 10) / 2) + '%'
        print(('\n' * 1000) + 'Archiving posts and comments...\n[' + '*' * filled + '_' * (20 - filled) + ']' + end + e)
        count += step
        for post in search_results:
            url = "https://reddit.com" + (post.permalink).replace('?ref=search_posts', '')
            data = {'user-agent': 'archive by /u/healdb'}
            # Manually grabbing this file is much faster than loading the
            # individual json of every single comment; it provides all of it.
            response = requests.get(url + '.json', headers=data)
            filename = folderName + "/" + post.name + '.json'
            with open(filename, 'w') as obj:
                obj.write(response.text)
            time.sleep(1)  # be polite to reddit's servers
        # Checkpoint the window just finished.
        with open(folderName + "/lastTimestamp.txt", 'w') as obj:
            obj.write(str(currentStamp))
        c += 1
    print('Welp, all done here! Stopped at timestamp ' + str(currentStamp))
# Total span in seconds; used by main() to scale its progress bar.
progress = endStamp-startStamp
# Top-level driver: retry main() forever, logging any error and restarting
# after a pause, so long archives survive transient network/API failures.
while True:
    try:
        main(startStamp,endStamp,step,folderName,subName,progress)
        print("Succesfully got all posts within parameters.")
        # Py2 input(): typing 1 yields the int 1.
        choice=input('You can now either\n1. Exit\n2. Get new posts\n(Input the number)\n')
        if(choice==1):
            exit()
        else:
            # Poll indefinitely for fresh posts/comments.
            while True:
                getNew(subName,folderName)
    except KeyboardInterrupt:
        exit()
    except SystemExit:
        # exit() raises SystemExit; handled explicitly so the blanket
        # handler below does not swallow a deliberate shutdown.
        exit()
    except:
        # NOTE(review): bare except — deliberately swallows everything so
        # the archiver keeps running; error is printed, then we loop.
        print("Error in the program! The error was as follows: ")
        error = traceback.format_exc()
        time.sleep(5)
        print(error)
        time.sleep(5)
        print("Resuming in 5 seconds...")
        time.sleep(5)