-
-
Save mattn/5004875 to your computer and use it in GitHub Desktop.
Index: twitvim.vim | |
=================================================================== | |
--- twitvim.vim (revision 339) | |
+++ twitvim.vim (working copy) | |
@@ -2696,18 +2696,13 @@ | |
let s:URL_PROTOCOL_HTTPS = '\%([Hh][Tt][Tt][Pp][Ss]\)://' | |
let s:URL_PROTOCOL_NON_HTTPS = '\%([Hh][Tt][Tt][Pp]\|[Ff][Tt][Pp]\)://' | |
-let s:URL_DOMAIN = '[^[:space:])/]\+' | |
-let s:URL_PATH_CHARS = '[^[:space:]()]' | |
+let s:URL_DOMAIN = '\a[a-zA-Z0-9_-]*\(\.[a-zA-Z0-9][a-zA-Z0-9_-]*\)*\(:\d+\)\{0,1}' | |
+let s:URL_PATH_CHARS = '[a-zA-Z0-9_/.\-+%#?&=;@$,!''*~]' | |
" URL paths may contain balanced parentheses. | |
let s:URL_PARENS = '('.s:URL_PATH_CHARS.'*)' | |
+let s:URL_PATH = '\%('.s:URL_PATH_CHARS.'*\%('.s:URL_PARENS.s:URL_PATH_CHARS.'*\)*\)\|\%('.s:URL_PATH_CHARS.'\+\)' | |
-" Avoid swallowing up certain punctuation characters after a URL but allow a | |
-" URL to end with a balanced parenthesis. | |
-let s:URL_PATH_END = '\%([^[:space:]\.,;:()]\|'.s:URL_PARENS.'\)' | |
- | |
-let s:URL_PATH = '\%('.s:URL_PATH_CHARS.'*\%('.s:URL_PARENS.s:URL_PATH_CHARS.'*\)*'.s:URL_PATH_END.'\)\|\%('.s:URL_PATH_CHARS.'\+\)' | |
- | |
" Bring it all together. Use this regex to match a URL. | |
let s:URLMATCH = s:URL_PROTOCOL.s:URL_DOMAIN.'\%(/\%('.s:URL_PATH.'\)\=\)\=' | |
let s:URLMATCH_HTTPS = s:URL_PROTOCOL_HTTPS.s:URL_DOMAIN.'\%(/\%('.s:URL_PATH.'\)\=\)\=' | |
@@ -2724,7 +2719,7 @@ | |
" Handle @-replies by showing that user's timeline. | |
" An @-reply must be preceded by a non-word character and ends at a | |
" non-word character. | |
- let matchres = matchlist(s, '\w\@<!@\(\w\+\)') | |
+ let matchres = matchlist(s, '[\w]\@<!@\(\w\+\)') | |
if matchres != [] | |
call s:get_timeline("user", matchres[1], 1, 0) | |
return | |
@@ -2985,7 +2980,7 @@ | |
" An @-reply must be preceded by a non-word character and ends at a | |
" non-word character. | |
- syntax match twitterReply "\w\@<!@\w\+" | |
+ syntax match twitterReply "[\w]\@<!@\w\+" | |
" A #-hashtag must be preceded by a non-word character and ends at a | |
" non-word character. |
mattn
commented
Feb 21, 2013
The broken syntax highlighting shows up with the following text:
.. “@mattn_jp: “Big Sky :: mruby から正規表現が消え、正規表現がやってきた。” http://htn.to/9puex1”
What encoding do you use?
The second problem doesn't show up in utf-8.
I don't think [\w] does what you think it does. It matches only \ and w. Better question would be what is coming before @ in that encoding you are using? FYI, it looked fine in euc-jp too.
What encoding do you use?
The second problem doesn't show up in utf-8.
CP932, but this problem occurs on UTF-8 also.
First of all, are you OK with the URL issue? Currently, you only have a question about \w?
CP932 is a DBCS (double-byte character set), and the trail byte of a CP932 character may fall in the ASCII range. For example, “ in CP932 is 0x81 0x67.
I checked in some changes to tighten up URL recognition a bit because the first problem happens in UTF-8 too.
I do not see the "@name problem under UTF-8, so it's just cp932.
CP932 stores each character as two bytes, as shown below:
leading byte: 0xa0 - 0xfe
trailing byte: 0x20 - 0x7e
So you can't use \w with \@<!, but I think this is Vim's bug.
But you shouldn't use [^[:space:]] for the URL path.
Ah, all of these issues are caused by Vim's bug. Below is a patch. I'll send this patch to vim-dev after some checking.
diff -r 8b86b69546a9 src/regexp.c
--- a/src/regexp.c Wed Feb 20 21:26:00 2013 +0100
+++ b/src/regexp.c Fri Feb 22 10:40:23 2013 +0900
@@ -5449,7 +5449,19 @@
}
}
else
- --rp->rs_un.regsave.rs_u.pos.col;
+ {
+#ifdef FEAT_MBYTE
+ if (has_mbyte)
+ {
+ int off = (*mb_head_off)(regline,
+ regline +
+ rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
+ rp->rs_un.regsave.rs_u.pos.col -= off;
+ }
+ else
+#endif
+ --rp->rs_un.regsave.rs_u.pos.col;
+ }
}
else
{