Tras varios días de investigación y de recopilación de información, nuestro grupo de expertos obtuvo la solución; esperamos que te resulte útil para tu proyecto.
Solución:
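Aquí está la expresión regular completa para analizar una URL. (Nota: en esta transcripción se perdieron las barras invertidas y las llaves de los cuantificadores —por ejemplo, `[a-fA-Fd]2` era `[a-fA-F\d]{2}` y `(?:d+)` era `(?:\d+)`—, y `[email protected]` es un artefacto de ofuscación de correo; la expresión no funciona tal como aparece aquí.)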
(?:http://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).
)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+)
)3))(?::(?:d+))?)(?:/(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-F
d]2))|[;:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))*)(?:?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))?)?)|(?:ftp://(?:(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?
:%[a-fA-Fd]2))|[;?&=])*)(?::(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-
fA-Fd]2))|[;?&=])*))[email protected])?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-
)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?
:d+)(?:.(?:d+))3))(?::(?:d+))?))(?:/(?:(?:(?:(?:[a-zA-Zd$-_.+!
*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[?:@&=])*))*)(?:;type=[AIDaid])?)?)|(?:news:(?:
(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;/?:&=])[email protected](?:(?:(
?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[
a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3)))|(?:[a-zA-Z](
?:[a-zA-Zd]|[_.+-])*)|*))|(?:nntp://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[
a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd
])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?)/(?:[a-zA-Z](?:[a-zA-Z
d]|[_.+-])*)(?:/(?:d+))?)|(?:telnet://(?:(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))|[;?&=])*)(?::(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[;?&=])*))[email protected])?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a
-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd]
)?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?))/?)|(?:gopher://(?:(?:
(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:
(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+
))?)(?:/(?:[a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))(?:(?:(?:[
a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))*)(?:%09(?:(?:(?:[a-zA
-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;:@&=])*)(?:%09(?:(?:[a-zA-Zd$
-_.+!*'(),;/?:@&=]|(?:%[a-fA-Fd]2))*))?)?)?)?)|(?:wais://(?:(?:(?:
(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:
[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?
)/(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)(?:(?:/(?:(?:[a-zA
-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)/(?:(?:[a-zA-Zd$-_.+!*'(),]|(
?:%[a-fA-Fd]2))*))|?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]
2))|[;:@&=])*))?)|(?:mailto:(?:(?:[a-zA-Zd$-_.+!*'(),;/?:@&=]|(?:%
[a-fA-Fd]2))+))|(?:file://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]
|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:
(?:d+)(?:.(?:d+))3))|localhost)?/(?:(?:(?:(?:[a-zA-Zd$-_.+!*'()
,]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(
?:%[a-fA-Fd]2))|[?:@&=])*))*))|(?:prospero://(?:(?:(?:(?:(?:[a-zA-Z
d](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)
*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?)/(?:(?:(?:(?
:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*)(?:/(?:(?:(?:[a-
zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&=])*))*)(?:(?:;(?:(?:(?:[
a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&])*)=(?:(?:(?:[a-zA-Zd
$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[?:@&])*)))*)|(?:ldap://(?:(?:(?:(?:
(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?:
[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))?
))?/(?:(?:(?:(?:(?:(?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad])
)|(?:%20))+|(?:OID|oid).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%2
0)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-F
d]2))*))(?:(?:(?:%0[Aa])?(?:%20)*)+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?
:(?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID
|oid).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])
?(?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)))*)(?:(
?:(?:(?:%0[Aa])?(?:%20)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))(?:(?:(?:(?:(
?:(?:[a-zA-Zd]|%(?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID|o
id).(?:(?:d+)(?:.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(
?:%20)*))?(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*))(?:(?:(?:
%0[Aa])?(?:%20)*)+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?:(?:(?:[a-zA-Zd]|%(
?:3d|[46][a-fA-Fd]|[57][Aad]))|(?:%20))+|(?:OID|oid).(?:(?:d+)(?:
.(?:d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a
-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))*)))*))*(?:(?:(?:%0[Aa])?(?:%2
0)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))?)(?:?(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))+)(?:,(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-f
A-Fd]2))+))*)?)(?:?(?:base|one|sub)(?:?(?:((?:[a-zA-Zd$-_.+!*'(
),;/?:@&=]|(?:%[a-fA-Fd]2))+)))?)?)?)|(?:(?:z39.50[rs])://(?:(?:(?
:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?).)*(?:[a-zA-Z](?:(?
:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:d+))3))(?::(?:d+))
?)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+)(?:+(?:(?:
[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+))*(?:?(?:(?:[a-zA-Zd$-_
.+!*'(),]|(?:%[a-fA-Fd]2))+))?)?(?:;esn=(?:(?:[a-zA-Zd$-_.+!*'(),
]|(?:%[a-fA-Fd]2))+))?(?:;rs=(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA
-Fd]2))+)(?:+(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))+))*)
?))|(?:cid:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@&=
])*))|(?:mid:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@
&=])*)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[;?:@&=]
)*))?)|(?:vemmi://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Z
d])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:
.(?:d+))3))(?::(?:d+))?)(?:/(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a
-fA-Fd]2))|[/?:@&=])*)(?:(?:;(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a
-fA-Fd]2))|[/?:@&])*)=(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd
]2))|[/?:@&])*))*))?)|(?:imap://(?:(?:(?:(?:(?:(?:(?:[a-zA-Zd$-_.+
!*'(),]|(?:%[a-fA-Fd]2))|[&=~])+)(?:(?:;[Aa][Uu][Tt][Hh]=(?:*|(?:(
?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~])+))))?)|(?:(?:;[
Aa][Uu][Tt][Hh]=(?:*|(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2
))|[&=~])+)))(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[
&=~])+))?))@)?(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])
?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:.(?:
d+))3))(?::(?:d+))?))/(?:(?:(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:
%[a-fA-Fd]2))|[&=~:@/])+)?;[Tt][Yy][Pp][Ee]=(?:[Ll](?:[Ii][Ss][Tt]|
[Ss][Uu][Bb])))|(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))
|[&=~:@/])+)(?:?(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[
&=~:@/])+))?(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-
9]d*)))?)|(?:(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~
:@/])+)(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-9]d*
)))?(?:/;[Uu][Ii][Dd]=(?:[1-9]d*))(?:(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn
]=(?:(?:(?:[a-zA-Zd$-_.+!*'(),]|(?:%[a-fA-Fd]2))|[&=~:@/])+)))?))
)?)|(?:nfs:(?:(?://(?:(?:(?:(?:(?:[a-zA-Zd](?:(?:[a-zA-Zd]|-)*[a-zA-
Zd])?).)*(?:[a-zA-Z](?:(?:[a-zA-Zd]|-)*[a-zA-Zd])?))|(?:(?:d+)(?:
.(?:d+))3))(?::(?:d+))?)(?:(?:/(?:(?:(?:(?:(?:[a-zA-Zd$-_.!~*'
(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd$-_.!~*'(),
])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?)))?)|(?:/(?:(?:(?:(?:(?:[a-zA-Zd
$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd$
-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?))|(?:(?:(?:(?:(?:[a-zA-
Zd$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Zd
$-_.!~*'(),])|(?:%[a-fA-Fd]2)|[:@&=+])*))*)?)))
Dada su complejidad, creo que debería optar por urlparse.
Para completar, aquí está el pseudo-BNF de la expresión regular anterior (como documentación):
; The generic form of a URL is:

genericurl     = scheme ":" schemepart

; Specific predefined schemes are defined here; new schemes
; may be registered with IANA

url            = httpurl | ftpurl | newsurl |
                 nntpurl | telneturl | gopherurl |
                 waisurl | mailtourl | fileurl |
                 prosperourl | otherurl

; new schemes follow the general syntax
otherurl       = genericurl

; the scheme is in lower case; interpreters should use case-ignore
scheme         = 1*[ lowalpha | digit | "+" | "-" | "." ]
schemepart     = *xchar | ip-schemepart

; URL schemeparts for ip based protocols:

ip-schemepart  = "//" login [ "/" urlpath ]

login          = [ user [ ":" password ] "@" ] hostport
hostport       = host [ ":" port ]
host           = hostname | hostnumber
hostname       = *[ domainlabel "." ] toplabel
domainlabel    = alphadigit | alphadigit *[ alphadigit | "-" ] alphadigit
toplabel       = alpha | alpha *[ alphadigit | "-" ] alphadigit
alphadigit     = alpha | digit
hostnumber     = digits "." digits "." digits "." digits
port           = digits
user           = *[ uchar | ";" | "?" | "&" | "=" ]
password       = *[ uchar | ";" | "?" | "&" | "=" ]
urlpath        = *xchar    ; depends on protocol see section 3.1

; The predefined schemes:

; FTP (see also RFC959)

ftpurl         = "ftp://" login [ "/" fpath [ ";type=" ftptype ]]
fpath          = fsegment *[ "/" fsegment ]
fsegment       = *[ uchar | "?" | ":" | "@" | "&" | "=" ]
ftptype        = "A" | "I" | "D" | "a" | "i" | "d"

; FILE

fileurl        = "file://" [ host | "localhost" ] "/" fpath

; HTTP

httpurl        = "http://" hostport [ "/" hpath [ "?" search ]]
hpath          = hsegment *[ "/" hsegment ]
hsegment       = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
search         = *[ uchar | ";" | ":" | "@" | "&" | "=" ]

; GOPHER (see also RFC1436)

gopherurl      = "gopher://" hostport [ / [ gtype [ selector
                 [ "%09" search [ "%09" gopher+_string ] ] ] ] ]
gtype          = xchar
selector       = *xchar
gopher+_string = *xchar

; MAILTO (see also RFC822)

mailtourl      = "mailto:" encoded822addr
encoded822addr = 1*xchar               ; further defined in RFC822

; NEWS (see also RFC1036)

newsurl        = "news:" grouppart
grouppart      = "*" | group | article
group          = alpha *[ alpha | digit | "-" | "." | "+" | "_" ]
article        = 1*[ uchar | ";" | "/" | "?" | ":" | "&" | "=" ] "@" host

; NNTP (see also RFC977)

nntpurl        = "nntp://" hostport "/" group [ "/" digits ]

; TELNET

telneturl      = "telnet://" login [ "/" ]

; WAIS (see also RFC1625)

waisurl        = waisdatabase | waisindex | waisdoc
waisdatabase   = "wais://" hostport "/" database
waisindex      = "wais://" hostport "/" database "?" search
waisdoc        = "wais://" hostport "/" database "/" wtype "/" wpath
database       = *uchar
wtype          = *uchar
wpath          = *uchar

; PROSPERO

prosperourl    = "prospero://" hostport "/" ppath *[ fieldspec ]
ppath          = psegment *[ "/" psegment ]
psegment       = *[ uchar | "?" | ":" | "@" | "&" | "=" ]
fieldspec      = ";" fieldname "=" fieldvalue
fieldname      = *[ uchar | "?" | ":" | "@" | "&" ]
fieldvalue     = *[ uchar | "?" | ":" | "@" | "&" ]

; Miscellaneous definitions

lowalpha       = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
                 "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
                 "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
                 "y" | "z"
hialpha        = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" |
                 "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" |
                 "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" |
                 "Y" | "Z"
alpha          = lowalpha | hialpha
digit          = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
                 "8" | "9"
safe           = "$" | "-" | "_" | "." | "+"
extra          = "!" | "*" | "'" | "(" | ")" | ","
national       = "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
punctuation    = "<" | ">" | "#" | "%" | <">

reserved       = ";" | "/" | "?" | ":" | "@" | "&" | "="
hex            = digit | "A" | "B" | "C" | "D" | "E" | "F" |
                 "a" | "b" | "c" | "d" | "e" | "f"
escape         = "%" hex hex

unreserved     = alpha | digit | safe | extra
uchar          = unreserved | escape
xchar          = unreserved | reserved | escape
digits         = 1*digit
Una forma sencilla de analizar (y validar) las URL es el módulo urlparse (en Python 2; en Python 3 es urllib.parse).
Una expresión regular es demasiado trabajo.
No existe un método de “validación” porque casi todo es una URL válida. Existen algunas reglas de puntuación para dividirla en partes. Sin ningún signo de puntuación, sigue siendo una URL válida.
Revise el RFC con cuidado y vea si puede construir una URL “no válida”. Las reglas son muy flexibles.
Por ejemplo, ":::::" es una URL válida. La ruta es ":::::". Un nombre de archivo bastante estúpido, pero un nombre de archivo válido.
También, "/////" es una URL válida. El netloc (“nombre de host”) es "". La ruta es "///". De nuevo, estúpido. También válido. Esta URL se normaliza a "///", que es equivalente.
Algo como "bad://///worse/////" es perfectamente válido. Tonto, pero válido.
En resumen: analícela y observe las piezas para ver si alguna le desagrada.
¿Quiere que el esquema sea siempre “http”? ¿Quiere que netloc sea siempre “www.somename.somedomain”? ¿Quiere que la ruta se parezca a una de Unix? ¿O a una de Windows? ¿Quiere eliminar la cadena de consulta? ¿O conservarla?
Estas no son validaciones especificadas por RFC. Estas son validaciones exclusivas de su aplicación.
Estoy usando el que usa Django y parece funcionar bastante bien:
def is_valid_url(url):
    """Check whether ``url`` looks like a well-formed http(s) URL.

    The pattern accepts ``http://`` or ``https://`` followed by a dotted
    domain name ending in a 2-6 letter top-level label, the literal
    ``localhost``, or a dotted quad of 1-3 digit groups (octet ranges are
    not checked), then an optional ``:port`` and an optional path/query
    containing no whitespace. Matching is case-insensitive.

    Args:
        url: The string to check, or ``None``.

    Returns:
        A truthy ``re.Match`` object when ``url`` matches; ``False`` when
        ``url`` is ``None``; ``None`` when ``url`` does not match. Use the
        result in a boolean context.
    """
    import re

    # The escapes (\d, \., \S) and the {m,n} quantifier braces are required:
    # without them the pattern only matches literal text such as "d1,3".
    regex = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return url is not None and regex.search(url)
Siempre puede consultar la última versión aquí: https://github.com/django/django/blob/master/django/core/validators.py#L74
Agradecemos que desees añadir valor a nuestra información aportando tu experiencia en los comentarios.