Skip to content

Instantly share code, notes, and snippets.

@JoeGlines
Created July 13, 2020 12:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JoeGlines/6f1c0710fa4abc2bcf751dd2108c62d9 to your computer and use it in GitHub Desktop.
Save JoeGlines/6f1c0710fa4abc2bcf751dd2108c62d9 to your computer and use it in GitHub Desktop.
Rip a page and shove it into Excel, then verify links, etc.
; ---------------------------------------------------------------------------
; Auto-execute section: script-wide settings, tray menu, and the two
; browser-key hotkeys (Forward = reload script, Back = open the ID picker).
; ---------------------------------------------------------------------------
#SingleInstance, Force
#NoEnv
SetBatchLines -1 ;run as fast as possible
;~ DetectHiddenWindows, On
;~ ListLines On ;on helps debug a script
SendMode Input ; Recommended for new scripts due to its superior speed and reliability.
; The custom tray icon lives at a machine-specific path. Only apply it when the
; file is present, so a missing icon cannot abort this section before the
; tray menu item below is registered.
TrayIconFile := "B:\Progs\AutoHotKey_l\Icons\Win\ico_shell32_dll0210.ico"
if FileExist(TrayIconFile)
    Menu, Tray, Icon, %TrayIconFile%, 1
Menu, Tray, Add, Change ID (Control + I), Change_ID
Browser_Forward::Reload
Browser_Back::
GoSub Change_ID
return
; to do
; 1) put in loop
; 2) break out root between UAT and ti.com
; 3) compare uat to ti.com
Run_It:
    ; Main pipeline, started from the dialog's Submit button: scrape the page,
    ; read the analytics variables, count words, fill the Excel sheet and,
    ; when the "Verify URLs" radio was chosen (verify = 1), check every link.
    TopCounts := 15          ; number of most-frequent words reported for the page
    Wait_For_Server := 2000  ; wait setting referenced by the link check when waiting for a response
    Gosub, Extract_HTML
    Gosub, Site_Catalyst
    Gosub, Word_Count
    Gosub, Import_Excel
    if (verify = 1)
        Gosub, Verify_href
    Gosub, End
return
Change_ID:
^i::
    ; Show a small dialog listing every element id found in the current IE
    ; page so the user can pick the container to scrape (stored in Body_ID)
    ; and choose whether links should be verified (stored in Verify).
    ;
    ; A dialog left open by an earlier invocation is destroyed first: adding
    ; controls with the same v-variables to an existing Gui raises an error.
    Gui, Destroy
    pwb := GetIE()
    if !pwb
    {
        MsgBox % "Internet Explorer could not be found. Open the page in Internet Explorer and try again."
        return
    }
    all := pwb.document.all ;.tags("option") ;.tags("A")
    ids := "" ; start clean even if a previous run stopped before its clean-up
    while (A_Index <= all.length)
    {
        ; only elements that actually carry an id are listed
        if (all[A_Index-1].id)
            ids .= all[A_Index-1].id . "|"
    }
    StringTrimRight, ids, ids, 1 ; drop the trailing pipe
    Gui,Add, Button,Default, Submit
    Gui,Add, Button,x+20 ,Cancel
    Gui,Add, Radio, x+20 vVerify group , Verify URLs
    Gui,Add, Radio, Checked y+10 , Don't Verify URLs
    Gui,Add, DropDownList, x6 y+10 w200 vBody_ID , %ids% ;Black|White|Red|Green|Blue
    GuiControl,ChooseString, body_id, ls-row-3-col-2 ;Try and select this one by default
    Gui Show, h75 w250, Test
    ids:="" ;Clear Ids
return
ButtonCancel:
    ; Cancel button: discard the dialog without running anything.
    Gui Destroy
return

ButtonSubmit:
    ; Submit button: store the dialog's control values in their variables
    ; (Verify, Body_ID), discard the dialog, then run the pipeline.
    Gui Submit
    Gui Destroy
    Gosub, Run_It
return
;*******************************************************
;*********************Webpage**********************************
;*******************************************************
Extract_HTML:
    ; Scrape every <a> element below the element whose id is Body_ID in the
    ; current IE page. Produces:
    ;   * Clipboard  - tab-delimited table (summary row, header row, one row per link)
    ;   * Tab_Title  - cleaned page title, used later as the worksheet name
    ;   * a copy of the page HTML saved next to the script
    ; Each link on the live page is also prefixed with a blue running number.
    StartTime := A_TickCount ;Get time started to check how long it takes
    Run_Search_Replace := 0 ; set to 1 below when any link text contains a line break
    pwb:=GetIE()
    ;~ pwb:=setWbCom(name:="",url:="http://www.ti.com/") ;get pointer TI
    ;~ to do ; give error if ie not running- make sure it is ie running and on page
    url:=pwb.locationURL
    ;**********************get tab title and trim / replace for illegal chars*********************************
    Full_Tab_Title:= pwb.Document.title ;getting tab title
    ; trim "- TI.com" as redundant; the title needs to be short
    StringReplace, Tab_Title, Full_Tab_Title, - TI.com,,
    Tab_Title:= RegExReplace(Tab_Title, "[#/\\:&\*\?\{<>|\]\.]", "_") ;replace illegal chars
    Tab_Title := RegExReplace(Tab_Title, "_+", " ") ; replace runs of _ with a single space
    Tab_Title :=RegExReplace(Tab_Title,".*?-(.*)","$1") ; keep only what follows the first hyphen, if any
    ; keep the first 31 characters (Excel's limit for a worksheet name)
    StringLeft,Tab_Title,Tab_Title,31
    ;*************************Grab just inner framework area******************************
    Test_Section:= pwb.document.getElementByID(Body_ID)
    if (Test_Section="") {
        MsgBox % "The ID / element could not be found on this page:`n`n" Tab_Title "`n" url
        return
    }
    ;*******************************************************
    Tag:="a" ;what tags looking for. Should do other than just a?
    Test_Section_CT:=Test_Section.all.tags(TAG).length ;count of all tags under above
    msg:="Links=" Test_Section_CT "`t`t" Full_Tab_Title "`t" URL "`r"
    msg.="Link #`tName`tInnerText`thref`tStatus`tOuterHTML`n"
    loop %Test_Section_CT% {
        ;**********************link text: flatten line breaks*********************************
        Inner_Text:= Test_Section.all.tags(TAG)[A_Index-1].InnerText
        ; number the link on the live page
        Test_Section.all.tags(TAG)[A_Index-1].outerhtml:="<span style='color:blue'>" A_Index . ") </span>" . Test_Section.all.tags(TAG)[A_Index-1].OuterHTML
        IfInString,Inner_Text,`n
        {
            ; "(chr10)" is a placeholder that Import_Excel turns back into a line feed
            StringReplace, Inner_Text, Inner_Text, `n, , All
            StringReplace, Inner_Text, Inner_Text, `r, (chr10), All
            Run_Search_Replace:=1
        }
        ;**********************Href-*********************************
        href:= Test_Section.all.tags(TAG)[A_Index-1].href
        name:= Test_Section.all.tags(TAG)[A_Index-1].name
        ;**********************remove line breaks and tabs in HTML *********************************
        Outer_HTML:= Test_Section.all.tags(TAG)[A_Index-1].OuterHTML
        Quote_HTML := false ; wrap in quotes once, no matter how many clean-ups applied
        if InStr(Outer_HTML, "`n")
        {
            StringReplace, Outer_HTML, Outer_HTML, `r`n, , All
            StringReplace, Outer_HTML, Outer_HTML, `n, , All
            StringReplace, Outer_HTML, Outer_HTML, `r, , All
            Quote_HTML := true
        }
        if InStr(Outer_HTML, A_Tab)
        {
            ; a literal tab would split the value across columns of the tab-delimited table
            StringReplace, Outer_HTML, Outer_HTML,%A_Tab%, , All
            Quote_HTML := true
        }
        if (Quote_HTML)
            Outer_HTML:="""" . Outer_HTML . """"
        ; the empty column between href and OuterHTML is the Status column
        msg.=A_index "`t" name "`t" Inner_Text "`t" href "`t`t" Outer_HTML "`n"
    }
    Clipboard:=msg
    msg=
    ;**********************save a copy of the page next to the script*********************************
    SplitPath, url, Page_name, dir, ext, name_no_ext, drive
    File_Name:=dir page_name
    StringReplace, File_name, File_name, http://,,
    StringReplace, File_name, File_name, /, _, All
    ; replace whatever is still illegal in a Windows file name (e.g. ":" from https, "?" from a query)
    File_name := RegExReplace(File_name, "[\\/:*?""<>|]", "_")
    Html_File := A_ScriptDir "\" File_name ".html"
    ; delete exactly the file that is written below, so content is never appended to an old copy
    FileDelete, %Html_File%
    page:=pwb.document.documentElement.OuterHTML
    HTML_page =
    ( Ltrim Join
    <!DOCTYPE html>
    <html>
    <head>
    </head>
    <body>
    %page%
    </body>
    </html>
    )
    FileAppend, %HTML_page%, %Html_File%, UTF-8
return
;*******************************************************
;**********************Excel*********************************
;**************************************************
Import_Excel:
; Paste the table that Extract_HTML left on the clipboard into a new worksheet
; named Tab_Title, then format it: header values/comments, freeze panes,
; alignment, fonts, shading, borders, row heights, column widths, and a
; hyperlink formula in column A for every data row.
; Note: the "Name:=value" arguments in the calls below are assignments, so they
; also overwrite the script-wide variables of the same name (RG, URL, Paste, ...).
; The XL_* helpers other than XL_Paste2 and XL_Hyperlink_Offset_Col2 are not
; defined in this file.
;~ path:=A_ScriptDir "\" Text_File ;~ MsgBox,,title, % path
Sleep 200
;~ xl:=XL_Start_Get(XL,1) ;WRB is pointer to workbook, Vis=0 for hidden Try=0 for new Excel
; First try: obtain a handle through XL_Handle and add the sheet there.
; If anything in that block throws, fall back to starting a new, visible Excel
; with a fresh workbook.
try
{
;~ XL := ComObjActive("Excel.Application") ;handle
XL:=XL_Handle(1) ;XL_Handle(XL,1) ;1=Application 2=Workbook 3=Worksheet
xl.Worksheets.Add().Name := Tab_Title
} Catch {
XL := ComObjCreate("Excel.Application") ;handle
XL.Visible := 1 ;1=Visible/Default 0=hidden
sleep, 500
xl.Workbooks.Add
Sleep, 200
xl.Worksheets.Add().Name := Tab_Title
}
; Bring Excel to the front before pasting the clipboard into A1 (1 = paste all)
WinActivate, ahk_class XLMAIN
Sleep 200
XL_Paste2(XL,Dest_RG:="a1",Paste:=1)
; NOTE(review): the header range is A1:E2 while the shading/bold lines below use A1:F2 - confirm which is intended
Header_RG:="A1:E2" ;Set header range
;**********************header cells: page name and top words*********************************
XL.Application.ActiveSheet.Range("B1").value:= Page_Name ;page name for Site Catalyst
;~ XL_Add_Comment(XL,RG:="b1",Comment:=Content_Group,Vis:=0,Size:=11,Font:="Book Antique",ForeClr:=5)
; Content_Group is attached to B1 as a cell comment
XL_Insert_Comment(XL,RG:="b1",Comment:=Content_Group,Vis:=0,Size:=11,Font:="Arial",ForeClr:=5)
XL.Application.ActiveSheet.Range("F1").value:= top_words ;most frequent words on the page (from Word_Count)
XL_Insert_Comment(XL,RG:="F1",Comment:="Top " TopCounts " words on page",Vis:=0,Size:=11,Font:="Book Antique",ForeClr:=5)
XL_Freeze(XL,Row:="2") ;Col A will not include cols which is default so leave out if unwanted
; LR is reused below for the row-height, replace and hyperlink ranges
LR:=XL_Last_Row(XL)
XL_Format_HAlign(XL,RG:=Header_RG,h:=2) ;1=Left 2=Center 3=Right
XL_Format_VAlign(XL,RG:=Header_RG,v:=4) ;1=Top 2=Center 3=Distrib 4=Bottom
XL_Format_Font(XL,RG:=Header_RG,Font:="Arial Narrow",Size:=11) ;Arial, Arial Narrow, Calibri
XL.Range("A1:F1").Interior.ColorIndex := 19 ;Shade header row yellow
XL.Range("A2:F2").Interior.ColorIndex := 6 ;Intense Yellow
XL.Range("A1:F2").Font.Bold := 1 ;Bold
XL_Border(XL,RG:=Header_RG,Weight:=2,Line:=2) ;1=Hairline 2=Thin 3=Med 4=Thick ;Line1=Solid 2=Dash 4=DashDot 5=DashDotDot
XL_Row_Height(XL,RG:="1:" LR "=-1") ;rows first then height -1 is auto
XL_Col_Width_Set(XL,RG:="A=10|B=30|C=30|D=90|E=8|F=175") ;-1 is auto
;**********************replace the (chr10) placeholder with a real line feed*********************************
; Run_Search_Replace is set by Extract_HTML when a link text contained a line break
if (Run_Search_Replace =1)
XL.Range("C2:C" LR).Replace("(chr10)",Chr(10)) ;need to convert to function
;**********************hyperlink*********************************
; For each cell in A3:A<LR>: link target is 3 columns to the right (column D), friendly text is the cell itself (offset 0)
XL_Hyperlink_Offset_Col2(XL,RG:="a3:a" LR,URL:="3",Freindly:="0") ;Neg values are rows Above/ Pos are Rows below
return
;**********************verify url in href*********************************
Verify_href:
    ; Check every href in column D of the active sheet and write the result
    ; into the column to its right (see Verify_Link).
    XL:=XL_Handle(1) ;XL_Handle(XL,1) ;1=Application 2=Workbook 3=Worksheet
    LR:=XL_Last_Row(XL)
    ; Rows 1-2 hold the summary and header rows written by Extract_HTML; link
    ; rows start at row 3. With no link rows, "D3:D<LR>" would reach back into
    ; those header rows (or be malformed when LR is blank), so stop here.
    if (LR < 3)
        return
    Verify_Link(XL,RG:="D3:D" LR ,Col_Dest:=2)
return
;*******************************************************
;**********************Content*********************************
;*******************************************************
Content:
    ; Collect the inner text of the DIV elements below the element whose id is
    ; Body_ID, strip blank lines, and paste the result into a new worksheet
    ; named "Content".
    pwb:=GetIE()
    TAG:="DIV"
    ;~ msgbox % pwb.document.getElementByID(Body_ID).getElementsByTagName("A")[0].innerTEXT
    Div_CT:=pwb.document.getElementByID(Body_ID).All.Tags(TAG).length -1 ;[0].innerTEXT
    Grp:=pwb.document.getElementByID(Body_ID).All.Tags(TAG)
    ; Text is a script-wide variable that Site_Catalyst and Word_Count also
    ; fill; start empty so earlier page HTML/text is not carried into the sheet.
    Text := ""
    ; NOTE(review): the loop reads Grp[1]..Grp[length-1], so the DIV at index 0
    ; is never included - confirm that skipping it is intended.
    Loop, %Div_CT% { ;loop over all Div
        Text.= "`n" . Grp[A_Index].Innertext ;append with line break
    }
    Text:= RegExReplace(text, "(^|\R)\K\s+") ;remove blank lines
    Xl.Sheets.Add ; add a new worksheet
    xl.activesheet.Name := "Content"
    Clipboard:=text
    XL.Application.ActiveSheet.Range("A1").PasteSpecial()
return
;**********************Site Catalyst *********************************
Site_Catalyst:
    ; Pull the values assigned to tiPageName and tiContentGroup out of the
    ; page's HTML and store them, lower-cased, in Page_Name and Content_Group.
    pwb := GetIE()
    text := pwb.document.documentElement.innerHTML

    ; The capture is lazy: it ends at the first  ";  that follows the opening
    ; quote. Page_Name1 holds the captured value.
    RegExMatch(text, "tiPageName\s?=\s?""(.*?)"";", Page_Name)
    StringLower, Page_Name, Page_Name1

    ; Same pattern for the content group; Content_Group1 holds the captured value.
    RegExMatch(text, "tiContentGroup\s?=\s?""(.*?)"";", Content_Group)
    StringLower, Content_Group, Content_Group1
return
;**********************page content- most freq value count*********************************
Word_Count:
    ; Store the page's most frequent repeated words in top_words
    ; (format produced by DuplicateFinderAndCounter, limited to TopCounts entries).
    pwb := GetIE()
    text := pwb.Document.body.innertext
    top_words := DuplicateFinderAndCounter(text, TopCounts)
return
;*******************************************************
;**********************End*********************************
;*******************************************************
end:
    ; Report how long the run took, measured from the StartTime stamp that
    ; Extract_HTML records.
    EndTime := A_TickCount
    Elapsed := EndTime - StartTime
    MsgBox, % "Verification is done and it took " MStoM(Elapsed)
return
;**********************Functions*********************************
;**********************Insert hyperlinks in Excel*********************************
XL_Hyperlink_Offset_Col2(PXL,RG="",URL="",Freindly=""){
    ; Turn each cell of RG (on the active sheet) into a =Hyperlink() formula.
    ;   PXL      - Excel object whose .Application.ActiveSheet is used
    ;   RG       - range address of the cells to rewrite, e.g. "a3:a20"
    ;   URL      - column offset from each cell to the cell holding the link target
    ;   Freindly - column offset from each cell to the cell holding the display
    ;              value; it is rounded, so it is expected to be numeric
    ; Cells whose target cell is empty are left untouched.
    For Cell in PXL.Application.ActiveSheet.Range(RG) {
        Link_Target := Cell.Offset(0, URL).Value
        if (Link_Target = "")
            continue
        ; A double quote inside the target would end the formula's string
        ; literal early; Excel expects it doubled.
        StringReplace, Link_Target, Link_Target, ", "", All
        Cell.Value := "=Hyperlink(""" . Link_Target . """,""" . Round(Cell.Offset(0, Freindly).Value) . """)"
    }
}
;**********************verify URL by pinging*********************************
Verify_Link(PXL,RG="",Col_Dest=""){
    ; Request every URL found in RG (on the active sheet) with WinHttpRequest and
    ; write "<status text>/<status code>" into the cell one column to the right,
    ; coloured green for 200 and red otherwise.
    ;   PXL      - Excel object whose .Application.ActiveSheet is used
    ;   RG       - range address holding the URLs, e.g. "D3:D40"
    ;   Col_Dest - currently unused; the result always goes to offset +1
    ; Empty cells, "#" links and javascript links are skipped.
    ;
    ; Wait_For_Server is a script-wide setting; without this declaration the
    ; function would only see an empty local variable of the same name.
    global Wait_For_Server
    ; WaitForResponse expects seconds. The setting is treated as milliseconds
    ; (the script sets 2000) - NOTE(review): confirm the intended unit.
    ; Falls back to 5 seconds when the setting is missing.
    Timeout_Sec := (Wait_For_Server > 0) ? Ceil(Wait_For_Server / 1000) : 5
    For Cell in PXL.Application.ActiveSheet.Range(RG) {
        url := Cell.value
        ; URL_Start = everything before the first ?, | or #  (this is what gets requested)
        RegExMatch(url, "^(?P<start>.*?)(?P<end>[?|#].*)?$", URL_)
        url := RTrim(url, "/") ; trim /
        url := RTrim(url, "#") ; trim pound
        if (url = "") or (url = "#")
            Continue ;don't verify if missing
        IfInString, url, javascript
            Continue ;don't verify if javascript
        ; start each row clean so a failed request cannot report the previous row's result
        Text := "", Status := ""
        type := "GET"
        ComObjError(false)
        WebRequest := ComObjCreate("WinHttp.WinHttpRequest.5.1")
        WebRequest.Open(Type, URL_Start)
        WebRequest.SetRequestHeader("Accept", "text/html;charset=utf-8")
        WebRequest.SetRequestHeader("Referer", URL) ;set refering site to url
        ; The "%" option keeps percent signs literal; without it "%7C0%" below
        ; would be read as a reference to a variable named 7C0.
        ; The content line stays unindented because leading whitespace would become part of the value.
Cookie=
( %
JSESSIONID=3F777724E42CC7EDE6FA8F37D513E038.node13; tidomain=www.ti.com; gpv_p9_o=rf430 learn nfc tab-en; s_cc=true; AP_COOKIE_EN=computerId-C_EN_286630705&geoStateCode-TX&ipGeoMapDate-1426850455286&expiryDate-1458386461914&lastVisitedDate-1426850461914&geoRegion-Americas&createdDate-1425302610354&geoCountryCode-US&ipAddress-156.117.61.214&; AB_TECHDOC_EN=%7C0%7C; AB_PREFERENCE_EN=Y; PROMO_TRACKER_EN=TM4C1230C3PM_17_en_1_2;
)
        WebRequest.SetRequestHeader("Cookie", Cookie)
        IfWinExist, Fiddler
            WebRequest.SetProxy(2,"localhost:8888") ;route through Fiddler only while it is running
        Try {
            WebRequest.Send()
            WebRequest.WaitForResponse(Timeout_Sec)
            Text := WebRequest.StatusText
            Status := WebRequest.Status ;numeric value
        } Catch {
            Text := "error"
            Status := "not tested"
        }
        ; COM failures may not reach Catch while COM error notification is
        ; switched off, so an empty status is reported the same way.
        if (Status = "") {
            Text := "error"
            Status := "not tested"
        }
        Cell.offset(0,1).Value := Text "/" Status
        if (status = "200")
            Cell.offset(0,1).Interior.ColorIndex := 4 ;green
        else if (status = "need to verify manually")
            Cell.offset(0,1).Interior.ColorIndex := 6 ;yellow
        else
            Cell.offset(0,1).Interior.ColorIndex := 3 ;red
    }
    ComObjError(true)
}
;**********************paste into excel*********************************
XL_Paste2(PXL,Dest_RG="",Paste=""){
    ; Paste the clipboard into Dest_RG on the active sheet via PasteSpecial.
    ;   PXL     - Excel object whose .Application.ActiveSheet is used
    ;   Dest_RG - destination range address, e.g. "a1"
    ;   Paste   - paste type. 1-5 are shorthand and are translated below:
    ;             1=All 2=Values 3=Comments 4=Formats 5=Formulas
    ;             Any other value is handed to PasteSpecial unchanged
    ;             (6=Validation 7=All Except Borders 8=Col Widths
    ;              11=Formulas and Number formats 12=Values and Number formats).
    ; Legacy "=" assignments are used so the translated value is stored the
    ; same way the original SetEnv stored it.
    if (Paste = 1)
        Paste = -4104 ;xlPasteAll
    else if (Paste = 2)
        Paste = -4163 ;xlPasteValues
    else if (Paste = 3)
        Paste = -4144 ;xlPasteComments
    else if (Paste = 4)
        Paste = -4122 ;xlPasteFormats
    else if (Paste = 5)
        Paste = -4123 ;xlPasteFormulas
    PXL.Application.ActiveSheet.Range(Dest_RG).PasteSpecial(Paste)
}
;**********************get IE*********************************
; Return a browser object for an Internet Explorer tab.
;   Name - optional text to look for in a tab's caption. When given, every
;          IEFrame window is searched for a tab whose text contains it;
;          when omitted, the first Internet Explorer_Server control of an
;          IEFrame window is used.
; Returns blank when no matching IE control is found.
; Relies on COM_Init / COM_GUID4String / COM_QueryService / COM_Release, which
; are not defined in this file (presumably an external COM library - confirm).
; NOTE(review): window handles and pointers are passed to DllCall as "Uint";
; confirm this is adequate for the AutoHotkey build in use.
GetIE(Name="") { ; GetIE(tab_name)
If(Name) {
; winList = number of IE windows, winList1..N = their window ids
WinGet, winList, List, ahk_class IEFrame
; outer loop: one pass per IE window, until a tab matched (m set)
While(winList%A_Index% && !m) {
n := A_Index, ErrorLevel := 0
; inner loop: read TabWindowClass1, 2, ... of window n until reading fails
; (ErrorLevel set) or a tab's text contains Name; m = index of that tab
While(!ErrorLevel && !m) {
ControlGetText, tabText, TabWindowClass%A_Index%, % "ahk_id" winList%n%
If InStr(tabText, Name)
m := A_Index ; win hwnd = winList%n%
} }
; handle of the Internet Explorer_Server control with the same index as the tab
ControlGet, hIESvr, hWnd, , Internet Explorer_Server%m%, % "ahk_id" winList%n%
} Else ControlGet, hIESvr, hWnd, , Internet Explorer_Server1, ahk_class IEFrame ; get Active IE
If Not hIESvr
Return
COM_Init()
; Ask the control for its document: WM_HTML_GETOBJECT yields an LRESULT that
; ObjectFromLresult converts into a document pointer (pdoc)
DllCall("SendMessageTimeout", "Uint", hIESvr, "Uint", DllCall("RegisterWindowMessage", "str", "WM_HTML_GETOBJECT"), "Uint", 0, "Uint", 0, "Uint", 2, "Uint", 1000, "UintP", lResult)
DllCall("oleacc\ObjectFromLresult", "Uint", lResult, "Uint", COM_GUID4String(IID_IHTMLDocument2,"{332C4425-26CB-11D0-B483-00C04FD90119}"), "int", 0, "UintP", pdoc)
IID_IWebBrowserApp := "{0002DF05-0000-0000-C000-000000000046}"
; From the document, query the IWebBrowserApp service, then release the document pointer
pweb := COM_QueryService(pdoc,IID_IWebBrowserApp,IID_IWebBrowserApp), COM_Release(pdoc)
Return pweb
}
;**********************Time to complete*********************************
MStoM(ms) { ; Convert milliseconds to a string of minutes and seconds
    ; Returns e.g. "1m 59s" for 119000.
    ; The duration is rounded to whole seconds first and then split, so the
    ; seconds part always stays within 0-59. (Rounding the minutes to one
    ; decimal and cutting the decimal off turned 1.98 min into 2 minutes and
    ; -1 seconds.) No SetFormat is needed, so the caller's float format is
    ; never touched.
    Total_Sec := Round(ms / 1000)
    m := Total_Sec // 60
    s := Mod(Total_Sec, 60)
    Return m . "m " . s . "s" ; Return minutes and seconds as a string
}
;**********************Duplicate and word counter*********************************
DuplicateFinderAndCounter(String, TopCounts) {
    ; Return the most frequent repeated words of String as "count word|count word|...",
    ; highest count first, limited to TopCounts entries.
    ;   String    - arbitrary text; it is lower-cased and split on runs of non-word characters
    ;   TopCounts - maximum number of entries to return
    ; Words that occur only once, words shorter than two characters, and the
    ; stop words listed below are not reported.
    Needle := "[\W]+"
    String := RegExReplace(String, Needle, "`n") ; one word per line
    ;**********************remove short words & words not want to track*********************************
    StringLower, String, String
    Kept := ""
    Loop, Parse, String, `n
    {
        ; also skips the empty fields produced by leading/trailing separators,
        ; which would otherwise be counted as a repeated empty word
        if (StrLen(A_LoopField) < 2)
            Continue
        if A_LoopField not in as,are,up,or,not,the,that,this,is,in,your,more,from,what,for,of,and,to,use,on,can,by,www,http,with,hi,low,high,new,index,if,id,var
            Kept .= A_LoopField "`n"
    }
    String := Kept
    Sort, String ; identical words become consecutive lines
    p := 1, needle := "im`n)^(.*)(\n\1)+`n"
    while p := RegExMatch(String, needle, duplicate, p + StrLen(duplicate)) { ; each match is one run of identical lines
        StringReplace, s, duplicate, `n,, UseErrorLevel ; ErrorLevel = number of line feeds = number of occurrences
        Duplicates .= ErrorLevel A_Space duplicate1 "`n" ; "count word"
    }
    Duplicates := Trim(Duplicates, "`n")
    Sort, Duplicates, RF SortingWithRegEx ; order by the leading count, highest first
    ; Cut just before the TopCounts-th line feed when there are more entries
    ; than requested. With fewer entries nothing is cut, and the last word
    ; keeps its final character.
    if (f := InStr(Duplicates, "`n", false, 1, TopCounts))
        Duplicates := SubStr(Duplicates, 1, f - 1)
    StringReplace, Duplicates, Duplicates, `n, |, All
    return, Duplicates
}
SortingWithRegEx(a1, a2) {
    ; Sort callback: compares two "count word" lines by their leading count so
    ; that the higher count sorts first; equal counts compare as equal.
    RegExMatch(a1, "(^\d+)", f1)
    RegExMatch(a2, "(^\d+)", f2)
    return f1 > f2 ? -1 : (f1 < f2 ? 1 : 0)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment