Skip to content

Instantly share code, notes, and snippets.

@heldev
Last active July 19, 2019 17:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save heldev/24d4449b935248cca6a226fcf380e548 to your computer and use it in GitHub Desktop.
Save heldev/24d4449b935248cca6a226fcf380e548 to your computer and use it in GitHub Desktop.
Adds repositories and corresponding licenses to every line with a link of an awesome list, if it's not a GitHub link the script tries to find GitHub links in the page content
#!/usr/bin/env stack
-- stack script --resolver lts-13.27
-- How to use:
-- 1. Install stack `curl -sSL https://get.haskellstack.org/ | sh`
-- (alternative ways available here https://docs.haskellstack.org/en/stable/README/#how-to-install )
-- 2. Fill parameters like `gitHubClientId`, `gitHubClientSecret`, `readMeUrl` ...
-- (GitHub credentials can be created here https://github.com/settings/applications/new )
-- 3. Run the file `./this-file.hs`
-- 4. Review newly created README.md in your working directory
-- Possible issues:
-- * All licenses are `NA` - double check your gitHub credentials, if they are ok
-- wait for an hour or check your API call limits
-- * Some urls cause ConnectionTimeout or DNS resolution exceptions - the simplest
-- workaround is to add them to `noGoUrls`
{-# LANGUAGE OverloadedStrings #-}
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import Control.Concurrent.Async
( mapConcurrently
)
import Control.Monad
( mapM
)
import Data.Aeson
( Object
, (.:)
)
import Data.Aeson.Types
( parseMaybe
, Parser)
import Data.List
( (\\)
)
import Data.Maybe
( fromMaybe
)
import Data.Semigroup
( (<>)
)
import Data.Traversable
( sequence
)
import Debug.Trace
( traceIO
)
import GHC.IO.Handle
( Handle
)
import Network.HTTP.Client
( newManager
, defaultManagerSettings
)
import Network.HTTP.Simple
( getResponseBody
, httpJSON
, httpLbs
, parseRequest
, Request
, Response
, setRequestHeaders
, setRequestQueryString
)
import System.IO
( IOMode
( WriteMode
)
,withFile)
import Text.Regex.Base.RegexLike
( getAllTextMatches
)
import Text.Regex.TDFA
( (=~)
)
gitHubClientId = Just ""
gitHubClientSecret = Just ""
readMeUrl = "https://raw.githubusercontent.com/akullpp/awesome-java/master/README.md"
noGoUrls = ["https://awesome.re", "https://manifold.systems", "https://logback.qos.ch", "https://www.slf4j.org"]
concurrency = 128
data ListItem = ListItem
{ line :: L.ByteString
, repositoryToLicense :: Map.Map RepositoryFullName L.ByteString
} deriving Show
type RepositoryFullName = L.ByteString
gitHubLinkRegex = "https?://github.com/([[:alnum:]-]+/[[:alnum:].-]+)" :: L.ByteString
main :: IO ()
main =
withFile "README.md" WriteMode writeAwesomeListWithRemarks
writeAwesomeListWithRemarks :: Handle -> IO ()
writeAwesomeListWithRemarks handle =
getUrlContent readMeUrl >>= addRemarks >>= L.hPut handle
getUrlContent :: String -> IO L.ByteString
getUrlContent url = do
traceIO $ "Loading " ++ url
getResponseBody <$> (httpLbs =<< parseRequest url)
addRemarks :: L.ByteString -> IO L.ByteString
addRemarks content = do
listItemChunks <- sequence $ mapConcurrently toListItem <$> (chunk concurrency $ C.lines content)
return $ C.unlines $ render <$> concat listItemChunks
toListItem :: L.ByteString -> IO ListItem
toListItem line =
ListItem line <$> getRepositoriesAndLicenses line
getRepositoriesAndLicenses :: L.ByteString -> IO (Map.Map RepositoryFullName L.ByteString)
getRepositoriesAndLicenses line =
sequence =<<
Map.fromSet getLicense <$> Set.fromList <$> getRepositories line
getLicense :: RepositoryFullName -> IO L.ByteString
getLicense repository =
getLicenseKey <$> getRepository repository
getRepository :: RepositoryFullName -> IO Object
getRepository ownerAndRepo = do
request <- prepareRequest $ "https://api.github.com/repos/" <> ownerAndRepo
getResponseBody <$> httpJSON request
prepareRequest :: L.ByteString -> IO Request
prepareRequest url =
setRequestHeaders headers
<$> setRequestQueryString queryString
<$> (parseRequest $ C.unpack url)
where
headers =
[ ("User-Agent", "haskell-script")
]
queryString =
[ ("client_id", gitHubClientId)
, ("client_secret", gitHubClientSecret)
]
getLicenseKey :: Object -> L.ByteString
getLicenseKey repository =
C.pack $ fromMaybe "NA" $ parseMaybe getLicenseKeyParser repository
where
getLicenseKeyParser object =
object .: "license" >>= (.: "key")
getRepositories :: L.ByteString -> IO [RepositoryFullName]
getRepositories line
| line =~ gitHubLinkRegex =
return $ extractRepositories line
| otherwise = do
pageContents <- mapM (getUrlContent . C.unpack) $ extractUrls line
return $ extractRepositories =<< pageContents
extractUrls :: L.ByteString -> [L.ByteString]
extractUrls content =
(getAllTextMatches $ content =~ linkRegex) \\ noGoUrls
where
linkRegex = "https?://[^)[:space:]]+" :: L.ByteString
extractRepositories :: L.ByteString -> [RepositoryFullName]
extractRepositories content =
concatMap (drop 1) (content =~ gitHubLinkRegex :: [[L.ByteString]])
render :: ListItem -> L.ByteString
render listItem =
line listItem <> (Map.foldMapWithKey toRemark $ repositoryToLicense listItem)
where
toRemark key value = " (" <> key <> " " <> value <> ")"
chunk :: Int -> [a] -> [[a]]
chunk _ [] = []
chunk size list = [take size list] ++ (chunk size $ drop size list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment