Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wwwanlingxiao
public-apis
Commits
7be0512b
Unverified
Commit
7be0512b
authored
Jan 12, 2022
by
Matheus Felipe
Browse files
Check if a link is working
parent
d06a3717
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/validate/links.py
View file @
7be0512b
# -*- coding: utf-8 -*-
import
sys
import
re
import
sys
import
random
from
typing
import
List
,
Tuple
import
requests
def
find_links_in_text
(
text
:
str
)
->
List
[
str
]:
"""Find links in a text and return a list of URLs."""
...
...
@@ -55,6 +58,80 @@ def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
return
(
has_duplicate
,
duplicates
)
def fake_user_agent() -> str:
    """Return a randomly picked browser User-Agent string.

    Some hosting services reject requests whose User-Agent is not on their
    allow-list, so one of several browser-like values is chosen via
    ``random.choice`` on every call.
    """
    browser_agents = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    )
    chosen = random.choice(browser_agents)
    return chosen
def get_host_from_link(link: str) -> str:
    """Return the host part of a URL.

    Everything up to and including the first ``://`` is dropped, and the
    remainder is cut at the first ``/``, ``?`` or ``#`` (whichever occurs
    earliest), removing routes, arguments and anchors. A port or user-info
    prefix, if present, is kept as part of the returned value.

    Raises:
        IndexError: if ``link`` does not contain ``://``.
    """
    host = link.split('://', 1)[1]

    # Remove routes, arguments and anchors. Each delimiter is applied in
    # turn (instead of stopping at the first kind found) so that a '?' or
    # '#' appearing before any '/' still terminates the host, e.g.
    # 'https://example.com?next=/home' -> 'example.com'.
    for delimiter in ('/', '?', '#'):
        host = host.split(delimiter, 1)[0]

    return host
def check_if_link_is_working(link: str) -> Tuple[bool, str]:
    """Checks if a link is working.

    A GET request is sent to the link with a 25 second timeout, a
    browser-like User-Agent and an explicit host header.

    If an error is identified when the request for the link occurs,
    the return will be a tuple with the first value True and the second
    value a string containing the error message. The message starts with
    one of these prefixes:

    - ``ERR:CLT`` -- the response status code is 400 or higher.
    - ``ERR:TMO`` -- the request timed out (connecting or reading).
    - ``ERR:SSL`` -- ``requests`` raised an ``SSLError``.
    - ``ERR:TMR`` -- ``requests`` raised ``TooManyRedirects``.
    - ``ERR:UKN`` -- any other exception was raised.

    If no errors are identified, the return will be a tuple with the
    first value False and the second an empty string.
    """
    has_error = False
    error_message = ''

    try:
        # Request the link exactly as given: appending '/' would corrupt
        # URLs that end in a file name, query string or fragment and make
        # working links look broken.
        resp = requests.get(link, timeout=25, headers={
            'User-Agent': fake_user_agent(),
            'host': get_host_from_link(link)
        })

        code = resp.status_code

        if code >= 400:
            has_error = True
            error_message = f'ERR:CLT: {code} : {link}'

    # requests.exceptions.Timeout is the common base of ConnectTimeout and
    # ReadTimeout, so slow responses are reported as timeouts too instead
    # of falling through to the generic handler.
    except (TimeoutError, requests.exceptions.Timeout):
        has_error = True
        error_message = f'ERR:TMO: {link}'

    except requests.exceptions.SSLError as error:
        has_error = True
        error_message = f'ERR:SSL: {error} : {link}'

    except requests.exceptions.TooManyRedirects as error:
        has_error = True
        error_message = f'ERR:TMR: {error} : {link}'

    # Deliberate catch-all: a link checker should report every failure
    # (including a malformed link) rather than abort the whole run.
    except Exception as error:
        has_error = True
        error_message = f'ERR:UKN: {error} : {link}'

    return (has_error, error_message)
if
__name__
==
'__main__'
:
num_args
=
len
(
sys
.
argv
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment