mapping task
- Host: GitHub
- URL: https://github.com/futuresea-dev/mapping-task
- Owner: futuresea-dev
- Created: 2021-08-24T18:16:45.000Z (almost 4 years ago)
- Default Branch: main
- Last Pushed: 2021-08-25T21:09:51.000Z (almost 4 years ago)
- Last Synced: 2024-12-29T11:44:43.376Z (5 months ago)
- Topics: abc, dateutil, html, pydantic, python, requests, scheduler, typing
- Language: Python
- Homepage:
- Size: 6.84 KB
- Stars: 1
- Watchers: 3
- Forks: 2
- Open Issues: 0
Metadata Files:
- Readme: README.md
README
# mapping-task
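The README is a single Python script: every five minutes it fetches a list of articles, pulls each article's JSON, reshapes the fields to match a pydantic `Article` model, strips HTML tags from text sections, and appends image and media sections fetched from a separate media endpoint. It relies on `requests`, `schedule`, and `pydantic`.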
```python
import time
import requests
import schedule
from datetime import datetime
from html.parser import HTMLParser
from io import StringIO


class MLStripper(HTMLParser):
    """Collects character data while discarding HTML tags.

    strip_tags() below relies on this class, but its body is missing from
    the README; this is the standard html.parser-based recipe.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()


# strip html tags from a string, returning plain text
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
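
# Illustrative example (an addition, not part of the original script):
#   strip_tags("<p>Hello <b>world</b></p>")  ->  "Hello world"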

def main():
    # create a single session reused across all requests
    s = requests.Session()
    # fetch the list of all articles
    api_list_url = "https://mapping-test.fra1.digitaloceanspaces.com/data/list.json"
    r = s.get(api_list_url)
    # check status code
    if r.status_code == 200:
        # convert content to json
        article_list = r.json()
        # loop over the articles, pulling each article id
        for article in article_list:
            data = {}
            article_id = article["id"]
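            # each list entry is assumed to carry at least an "id" field;
            # the full article payload is fetched separately below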
            # build the article url and fetch the article data
            arc_data_url = "https://mapping-test.fra1.digitaloceanspaces.com/data/articles/{id}.json".format(id=article_id)
            get_data = s.get(arc_data_url)
            if get_data.status_code == 200:
                # convert content to json
                data = get_data.json()
                categories = [data["category"]]
                # reshape the data: add the fields the Article model expects
                data["categories"] = categories
                # note: pub_date uses semicolons in its time part,
                # e.g. "2021-08-24-18;16;45"
                data["publication_date"] = datetime.strptime(data["pub_date"], '%Y-%m-%d-%H;%M;%S')
                if "mod_date" in data:
                    data["modification_date"] = datetime.strptime(data["mod_date"], '%Y-%m-%d-%H:%M:%S')
                else:
                    data["modification_date"] = datetime.now()
                # keep at most four sections
                if len(data["sections"]) > 4:
                    data["sections"] = data["sections"][:4]
                # rebuild the sections one by one
                tmp_section = []
                for section in data["sections"]:
                    tmp = {}
                    for k, v in section.items():
                        if k == "text":
                            # strip html tags from text content
                            tmp[k] = strip_tags(v)
                        elif k == "type" and v == "media":
                            # in-body media sections become text placeholders
                            tmp["text"] = "media"
                            tmp["type"] = "text"
                        else:
                            tmp[k] = v
                    tmp_section.append(tmp)
                # put the rebuilt sections back
                data["sections"] = tmp_section
                # fetch the media data for this article
                arc_media_url = "https://mapping-test.fra1.digitaloceanspaces.com/data/media/{id}.json".format(id=article_id)
                get_data = s.get(arc_media_url)
                if get_data.status_code == 200:
                    # the media payload is expected to hold an image section
                    # at [0] and a media section at [1]
                    img_data = get_data.json()
                    data["sections"].append(img_data[0])
                    # normalize the media section's dates, then append it
                    img_data[1]["publication_date"] = datetime.strptime(img_data[1]["pub_date"], '%Y-%m-%d-%H;%M;%S')
                    if "mod_date" in img_data[1]:
                        img_data[1]["modification_date"] = datetime.strptime(img_data[1]["mod_date"], '%Y-%m-%d-%H:%M:%S')
                    else:
                        img_data[1]["modification_date"] = datetime.now()
                    data["sections"].append(img_data[1])
                else:
                    # no media available: append placeholder sections instead
                    image_section = {"type": "image", "url": "https://google.com", "alt": "none",
                                     "caption": "none", "source": "none"}
                    media_section = {"type": "media", "id": "None", "url": "https://google.com",
                                     "thumbnail": "https://google.com", "author": "none",
                                     "caption": "none", "publication_date": datetime.now(),
                                     "modification_date": datetime.now(), "duration": 0}
                    data["sections"].append(image_section)
                    data["sections"].append(media_section)
                # validate the assembled payload against the Article model
                # (a pydantic model whose definition is not shown in the README)
                user = Article(**data)
                print(user)
            else:
                pass
    else:
        print("invalid status : " + str(r.status_code))


if __name__ == "__main__":
    # run main() every 5 minutes
    schedule.every(5).minutes.do(main)
    while True:
        schedule.run_pending()
        time.sleep(1)  # sleep between checks instead of busy-waiting
    # main()
    # print("done")
```
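
The `Article` model itself is not included in the README. Below is a minimal sketch of what it might look like, inferred from the fields the script assembles; every field name and type here is an assumption, not the repo's actual model.

```python
from datetime import datetime
from typing import Any, Dict, List

from pydantic import BaseModel


# Hypothetical sketch: field names mirror the keys the script sets on `data`;
# the real model in the repo may differ.
class Article(BaseModel):
    id: str
    categories: List[str]
    publication_date: datetime
    modification_date: datetime
    sections: List[Dict[str, Any]]
```

With a model of this shape, `Article(**data)` validates each assembled payload (pydantic ignores the leftover raw keys such as `pub_date` by default), and `print(user)` shows the parsed result.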