Saltar al contenido

¿Cómo se valida una URL con una expresión regular en Python?

Tras varios días de investigación y de recopilar información, nuestro grupo de expertos obtuvo la solución; esperamos que te resulte útil para tu proyecto.

Solución:

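Aquí está la expresión regular completa para analizar una URL. (Nota: en esta transcripción se han perdido las barras invertidas y las llaves de la expresión —por ejemplo, «d» debería ser «\d» y «2» debería ser «{2}»— y algunos fragmentos con «@» aparecen sustituidos por «[email protected]», por lo que no puede copiarse tal cual.)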

(?:http://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).
)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+)
)3))(?::(?:d+))?)(?:/(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-F
d]2))|[;:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))*)(?:?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))?)?)|(?:ftp://(?:(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?
:%[a-fA-Fd]2))|[;?&=])*)(?::(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-
fA-Fd]2))|[;?&=])*))[email protected])?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-
)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?
:d+)(?:.(?:d+))3))(?::(?:d+))?))(?:/(?:(?:(?:(?:[a-zA-Zd$-_.+!
*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[?:@&=])*))*)(?:;type=[AIDaid])?)?)|(?:news:(?:
(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;/?:&=])[email protected](?:(?:(
?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[
a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3)))|(?:[a-zA-Z](
?:[a-zA-Zd]|[_.+-])*)|*))|(?:nntp://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[
a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd
])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?)/(?:[a-zA-Z](?:[a-zA-Z
d]|[_.+-])*)(?:/(?:d+))?)|(?:telnet://(?:(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))|[;?&=])*)(?::(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[;?&=])*))[email protected])?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a
-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd]
)?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?))/?)|(?:gopher://(?:(?:
(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:
(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+
))?)(?:/(?:[a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))(?:(?:(?:[
a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))*)(?:%09(?:(?:(?:[a-zA
-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;:@&=])*)(?:%09(?:(?:[a-zA-Zd$
-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))*))?)?)?)?)|(?:wais://(?:(?:(?:
(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:
[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?
)/(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)(?:(?:/(?:(?:[a-zA
-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)/(?:(?:[a-zA-Zd$-_.+!*'(),]|(
?:%[a-fA-Fd]2))*))|?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))?)|(?:mailto:(?:(?:[a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%
[a-fA-Fd]2))+))|(?:file://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]
|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:
(?:d+)(?:.(?:d+))3))|localhost)?/(?:(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(
?:%[a-fA-Fd]2))|[?:@&=])*))*))|(?:prospero://(?:(?:(?:(?:(?:[a-zA-Z
d](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)
*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?)/(?:(?:(?:(?
:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-
zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*))*)(?:(?:;(?:(?:(?:[
a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&])*)=(?:(?:(?:[a-zA-Zd
$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&])*)))*)|(?:ldap://(?:(?:(?:(?:
(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:
[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?
))?/(?:(?:(?:(?:(?:(?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad])
)|(?:%20))+|(?:OID|oid).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%2
0)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-F
d]2))*))(?:(?:(?:%0[Aa])?(?:%20)*)+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?
:(?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID
|oid).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])
?(?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)))*)(?:(
?:(?:(?:%0[Aa])?(?:%20)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))(?:(?:(?:(?:(
?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID|o
id).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(
?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*))(?:(?:(?:
%0[Aa])?(?:%20)*)+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?:(?:(?:[a-zA-Zd]|%(
?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID|oid).(?:(?:d+)(?:
.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a
-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)))*))*(?:(?:(?:%0[Aa])?(?:%2
0)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))?)(?:?(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))+)(?:,(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-f
A-Fd]2))+))*)?)(?:?(?:base|one|sub)(?:?(?:((?:[a-zA-Zd$-_.+!*'(
),;/?:@&=]|(?:%[a-fA-Fd]2))+)))?)?)?)|(?:(?:z39.50[rs])://(?:(?:(?
:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?
:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))
?)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+)(?:+(?:(?:
[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+))*(?:?(?:(?:[a-zA-Zd$-_
.+!*'(),]|(?:%[a-fA-Fd]2))+))?)?(?:;esn=(?:(?:[a-zA-Zd$-_.+!*'(),
]|(?:%[a-fA-Fd]2))+))?(?:;rs=(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA
-Fd]2))+)(?:+(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+))*)
?))|(?:cid:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@&=
])*))|(?:mid:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@
&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@&=]
)*))?)|(?:vemmi://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Z
d])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:
.(?:d+))3))(?::(?:d+))?)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a
-fA-Fd]2))|[/?:@&=])*)(?:(?:;(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a
-fA-Fd]2))|[/?:@&])*)=(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd
]2))|[/?:@&])*))*))?)|(?:imap://(?:(?:(?:(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))|[&=~])+)(?:(?:;[Aa][Uu][Tt][Hh]=(?:*|(?:(
?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~])+))))?)|(?:(?:;[
Aa][Uu][Tt][Hh]=(?:*|(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2
))|[&=~])+)))(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[
&=~])+))?))@)?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])
?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:
d+))3))(?::(?:d+))?))/(?:(?:(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:
%[a-fA-Fd]2))|[&=~:@/])+)?;[Tt][Yy][Pp][Ee]=(?:[Ll](?:[Ii][Ss][Tt]|
[Ss][Uu][Bb])))|(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))
|[&=~:@/])+)(?:?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[
&=~:@/])+))?(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-
9]d*)))?)|(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~
:@/])+)(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-9]d*
)))?(?:/;[Uu][Ii][Dd]=(?:[1-9]d*))(?:(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn
]=(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~:@/])+)))?))
)?)|(?:nfs:(?:(?://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-
Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:
.(?:d+))3))(?::(?:d+))?)(?:(?:/(?:(?:(?:(?:(?:[a-zA-Zd$-_.!~*'
(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd$-_.!~*'(),
])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?)))?)|(?:/(?:(?:(?:(?:(?:[a-zA-Zd
$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd$
-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?))|(?:(?:(?:(?:(?:[a-zA-
Zd$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd
$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?)))

Dada su complejidad, creo que debería optar por urlparse.

Para completar, aquí está el pseudo-BNF de la expresión regular anterior (como documentación):

; The generic form of a URL is:

genericurl     = scheme ":" schemepart

; Specific predefined schemes are defined here; new schemes
; may be registered with IANA

url            = httpurl | ftpurl | newsurl |
                 nntpurl | telneturl | gopherurl |
                 waisurl | mailtourl | fileurl |
                 prosperourl | otherurl

; new schemes follow the general syntax
otherurl       = genericurl

; the scheme is in lower case; interpreters should use case-ignore
scheme         = 1*[ lowalpha | digit | "+" | "-" | "." ]
schemepart     = *xchar | ip-schemepart


; URL schemeparts for ip based protocols:

ip-schemepart  = "//" login [ "/" urlpath ]

login          = [ user [ ":" password ] "@" ] hostport
hostport       = host [ ":" port ]
host           = hostname | hostnumber
hostname       = *[ domainlabel "." ] toplabel
domainlabel    = alphadigit | alphadigit *[ alphadigit | "-" ] alphadigit
toplabel       = alpha | alpha *[ alphadigit | "-" ] alphadigit
alphadigit     = alpha | digit
hostnumber     = digits "." digits "." digits "." digits
port           = digits
user           = *[ uchar | ";" | "?" | "&" | "=" ]
password       = *[ uchar | ";" | "?" | "&" | "=" ]
urlpath        = *xchar    ; depends on protocol see section 3.1

; The predefined schemes:

; FTP (see also RFC959)

ftpurl         = "ftp://" login [ "/" fpath [ ";type=" ftptype ]]
fpath          = fsegment *[ "/" fsegment ]
fsegment       = *[ uchar | "?" | ":" | "@" | "&" | "=" ]
ftptype        = "A" | "I" | "D" | "a" | "i" | "d"

; FILE

fileurl        = "file://" [ host | "localhost" ] "/" fpath

; HTTP

httpurl        = "http://" hostport [ "/" hpath [ "?" search ]]
hpath          = hsegment *[ "/" hsegment ]
hsegment       = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
search         = *[ uchar | ";" | ":" | "@" | "&" | "=" ]

; GOPHER (see also RFC1436)

gopherurl      = "gopher://" hostport [ / [ gtype [ selector
                 [ "%09" search [ "%09" gopher+_string ] ] ] ] ]
gtype          = xchar
selector       = *xchar
gopher+_string = *xchar

; MAILTO (see also RFC822)

mailtourl      = "mailto:" encoded822addr
encoded822addr = 1*xchar               ; further defined in RFC822

; NEWS (see also RFC1036)

newsurl        = "news:" grouppart
grouppart      = "*" | group | article
group          = alpha *[ alpha | digit | "-" | "." | "+" | "_" ]
article        = 1*[ uchar | ";" | "/" | "?" | ":" | "&" | "=" ] "@" host

; NNTP (see also RFC977)

nntpurl        = "nntp://" hostport "/" group [ "/" digits ]

; TELNET

telneturl      = "telnet://" login [ "/" ]

; WAIS (see also RFC1625)

waisurl        = waisdatabase | waisindex | waisdoc
waisdatabase   = "wais://" hostport "/" database
waisindex      = "wais://" hostport "/" database "?" search
waisdoc        = "wais://" hostport "/" database "/" wtype "/" wpath
database       = *uchar
wtype          = *uchar
wpath          = *uchar

; PROSPERO

prosperourl    = "prospero://" hostport "/" ppath *[ fieldspec ]
ppath          = psegment *[ "/" psegment ]
psegment       = *[ uchar | "?" | ":" | "@" | "&" | "=" ]
fieldspec      = ";" fieldname "=" fieldvalue
fieldname      = *[ uchar | "?" | ":" | "@" | "&" ]
fieldvalue     = *[ uchar | "?" | ":" | "@" | "&" ]

; Miscellaneous definitions

lowalpha       = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
                 "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
                 "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
                 "y" | "z"
hialpha        = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
                 "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
                 "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
alpha          = lowalpha | hialpha
digit          = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
                 "8" | "9"
safe           = "$" | "-" | "_" | "." | "+"
extra          = "!" | "*" | "'" | "(" | ")" | ","
national       = "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
punctuation    = "<" | ">" | "#" | "%" | <">


reserved       = ";" | "/" | "?" | ":" | "@" | "&" | "="
hex            = digit | "A" | "B" | "C" | "D" | "E" | "F" |
                 "a" | "b" | "c" | "d" | "e" | "f"
escape         = "%" hex hex

unreserved     = alpha | digit | safe | extra
uchar          = unreserved | escape
xchar          = unreserved | reserved | escape
digits         = 1*digit

Una forma sencilla de analizar (y validar) las URL es el módulo urlparse (en Python 2; urllib.parse en Python 3).

Una expresión regular es demasiado trabajo.


No existe un método de “validación” porque casi todo es una URL válida. Existen algunas reglas de puntuación para dividirlo. Sin ningún signo de puntuación, todavía tiene una URL válida.

Revise el RFC con cuidado y vea si puede construir una URL “no válida”. Las reglas son muy flexibles.

Por ejemplo, ::::: es una URL válida. La ruta es ":::::". Un nombre de archivo bastante estúpido, pero un nombre de archivo válido.

También ///// es una URL válida. El netloc (“nombre de host”) es "". La ruta es "///". De nuevo, estúpido, pero también válido. Esta URL se normaliza a "///", que es equivalente.

Algo como "bad://///worse/////" es perfectamente válido. Tonta pero válida.

En resumen: analice la URL y observe las piezas para ver si alguna le desagrada de alguna manera.

¿Quiere que el esquema sea siempre “http”? ¿Quiere que netloc sea siempre “www.somename.somedomain”? ¿Quiere que la ruta se parezca a las de Unix? ¿O a las de Windows? ¿Quiere eliminar la cadena de consulta? ¿O conservarla?

Estas no son validaciones especificadas por RFC. Estas son validaciones exclusivas de su aplicación.

Estoy usando el que usa Django y parece funcionar bastante bien:

def is_valid_url(url):
    """Check whether *url* looks like an absolute ``http``/``https`` URL.

    The pattern accepts a host that is a dotted domain name ending in a
    2-6 letter top-level label, the literal ``localhost``, or a dotted
    quad of 1-3 digit groups, followed by an optional ``:port`` and an
    optional path/query with no whitespace. Matching is case-insensitive.

    Args:
        url: The string to check, or ``None``.

    Returns:
        A truthy ``re.Match`` object when *url* is accepted, ``None`` when
        it does not match, and ``False`` when *url* is ``None``.
    """
    import re
    regex = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        # \Z instead of $ so a trailing newline is not silently accepted
        r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)
    return url is not None and regex.search(url)

Siempre puede consultar la última versión aquí: https://github.com/django/django/blob/master/django/core/validators.py#L74

Agradecemos que desees añadir valor a nuestra información aportando tu experiencia en los comentarios.

¡Haz clic para puntuar esta entrada!
(Votos: 0 Promedio: 0)



Utiliza Nuestro Buscador

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *