mirror of
https://github.com/captn3m0/gsur-eml-to-markdown.git
synced 2024-09-19 12:57:10 +00:00
57 lines
2.4 KiB
Python
57 lines
2.4 KiB
Python
from email.utils import parsedate_to_datetime
|
|
from email.parser import Parser
|
|
import dateutil.relativedelta
|
|
from email import policy
|
|
import pystache
|
|
import sys
|
|
import re
|
|
|
|
def parse_res(r, keys):
    """Attach field names to positional regex captures.

    ``r`` is either a single match (anything indexable by position, such as
    one tuple produced by ``re.findall`` for a multi-group pattern) or a
    ``list`` of such matches.  ``keys`` names the positions, in order.

    Returns a single dict when ``r`` is not a list, otherwise a list with one
    dict per row.  Positions beyond ``len(keys)`` are ignored; a row shorter
    than ``keys`` raises ``IndexError``.
    """

    def label(row):
        # Look values up by position (not zip) so a short row fails loudly.
        return {name: row[pos] for pos, name in enumerate(keys)}

    if not isinstance(r, list):
        return label(r)
    return [label(row) for row in r]
|
|
|
|
def convert_to_markdown(file):
    """Render the statistics found in a report e-mail through a template.

    Parses the ``.eml`` file at ``file``, extracts figures (clicks,
    impressions, per-URL, per-query, per-device and per-country numbers) from
    its plain-text part with the regular expressions below, and prints the
    result of rendering ``template.mustache`` with them.  The template is
    opened relative to the current working directory.

    The report is attributed to the month preceding the message's ``Date``
    header.

    Raises:
        ValueError: if the message has no ``text/plain`` part.
        IndexError: if the clicks, impressions or device table cannot be
            found in the text.
    """
    with open(file) as f:
        em = Parser(policy=policy.default).parse(f)

    # preferencelist must be a sequence of subtypes.  ('plain') without the
    # trailing comma is just the string 'plain', which only works by accident
    # through substring matching.
    body = em.get_body(preferencelist=('plain',))
    if body is None:
        raise ValueError("{}: no text/plain body found".format(file))
    # NOTE(review): as_string() serializes the whole part (headers included).
    # The slice offsets used below presumably depend on that form, so it is
    # deliberately left unchanged.
    text = body.as_string()

    date = parsedate_to_datetime(em.get('Date'))
    last_month = date - dateutil.relativedelta.relativedelta(months=1)

    # See https://regex101.com/r/QRQ2Va/1 for regex sample text
    re_clicks = r"(.*)\s+Clicks \(web\)"
    re_impressions = r"(.*)\s+Impressions \(web\)"
    re_url_stats = r"(?P<url>^http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$)\n+(?P<num>(?:\+?[\w\.]+))"
    re_growing_queries = r"(?P<query>.*)\n+(?P<num>\+?[\d\.]+\w?) clicks \((?P<device>\w+)\)"
    re_top_queries = r"(?P<query>.{6,})\n{2}(?P<num>\d\.\w+$)"
    re_devices = r"Desktop Mobile Tablet\s+^(?P<desktop>(?:\d|\.)+[A-Z]?) (?P<mobile>(?:\d|\.)+[A-Z]?) (?P<tablet>(?:\d|\.)+[A-Z]?)$"

    # Skip the first two matches
    re_visit_breakdown = r"(?P<country>[A-Z].*)\n+(?P<c>(?:\d|\.|K|M)+$)\s+"

    # Each of these patterns feeds two sections of the report; scan once and
    # slice the shared result instead of re-running the regex.
    url_stats = re.findall(re_url_stats, text, re.M)
    visit_breakdown = re.findall(re_visit_breakdown, text, re.M)

    data = {
        "year": last_month.year,
        "month": last_month.month,
        "date_formatted": last_month.strftime('%B %Y'),
        "datetime": last_month,
        "clicks": re.findall(re_clicks, text)[0],
        "impressions": re.findall(re_impressions, text)[0],
        # URL matches 1-3 are used as growing pages, 4-6 as top performers;
        # match 0 is skipped.
        "growing_pages": parse_res(url_stats[1:4], ['url', 'growth']),
        "performing_pages": parse_res(url_stats[4:7], ['url', 'hits']),
        "growing_queries": parse_res(re.findall(re_growing_queries, text), ['query', 'growth', 'device']),
        "top_queries": parse_res(re.findall(re_top_queries, text, re.M), ['query', 'hits']),
        # The desktop capture is exposed to the template under the key 'web'.
        "device_breakdown": parse_res(re.findall(re_devices, text, re.M)[0], ['web', 'mobile', 'tablet']),
        # Matches 2-4 are used as countries, the final three as types.
        "country_breakdown": parse_res(visit_breakdown[2:5], ['country', 'hits']),
        "type_breakdown": parse_res(visit_breakdown[-3:], ['type', 'hits']),
    }

    with open('template.mustache', 'r') as tpl:
        print(pystache.render(tpl.read(), data))
|
|
|
|
if __name__ == '__main__':
    # The path of the .eml file is expected as the first command-line argument.
    if len(sys.argv) < 2:
        # sys.exit(str) writes the message to stderr and exits with status 1,
        # so a missing argument is reported as a failure instead of success.
        sys.exit("Please run as python convert.py file.eml")
    convert_to_markdown(sys.argv[1])